summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig27
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/afs_vl.h9
-rw-r--r--fs/afs/file.c1
-rw-r--r--fs/afs/fs_operation.c2
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/rotate.c11
-rw-r--r--fs/bcachefs/Kconfig7
-rw-r--r--fs/bcachefs/Makefile1
-rw-r--r--fs/bcachefs/acl.c2
-rw-r--r--fs/bcachefs/alloc_background.c45
-rw-r--r--fs/bcachefs/alloc_background.h3
-rw-r--r--fs/bcachefs/alloc_foreground.c59
-rw-r--r--fs/bcachefs/alloc_foreground.h5
-rw-r--r--fs/bcachefs/backpointers.c108
-rw-r--r--fs/bcachefs/backpointers.h23
-rw-r--r--fs/bcachefs/bcachefs.h17
-rw-r--r--fs/bcachefs/bcachefs_format.h10
-rw-r--r--fs/bcachefs/bkey.h8
-rw-r--r--fs/bcachefs/bkey_methods.c2
-rw-r--r--fs/bcachefs/bkey_methods.h2
-rw-r--r--fs/bcachefs/bset.c182
-rw-r--r--fs/bcachefs/bset.h4
-rw-r--r--fs/bcachefs/btree_cache.c273
-rw-r--r--fs/bcachefs/btree_cache.h3
-rw-r--r--fs/bcachefs/btree_gc.c29
-rw-r--r--fs/bcachefs/btree_io.c14
-rw-r--r--fs/bcachefs/btree_io.h4
-rw-r--r--fs/bcachefs/btree_iter.c63
-rw-r--r--fs/bcachefs/btree_iter.h52
-rw-r--r--fs/bcachefs/btree_key_cache.c405
-rw-r--r--fs/bcachefs/btree_key_cache_types.h18
-rw-r--r--fs/bcachefs/btree_locking.h13
-rw-r--r--fs/bcachefs/btree_node_scan.c2
-rw-r--r--fs/bcachefs/btree_trans_commit.c110
-rw-r--r--fs/bcachefs/btree_types.h60
-rw-r--r--fs/bcachefs/btree_update.c12
-rw-r--r--fs/bcachefs/btree_update.h3
-rw-r--r--fs/bcachefs/btree_update_interior.c37
-rw-r--r--fs/bcachefs/btree_update_interior.h2
-rw-r--r--fs/bcachefs/buckets.c35
-rw-r--r--fs/bcachefs/buckets.h15
-rw-r--r--fs/bcachefs/buckets_types.h8
-rw-r--r--fs/bcachefs/chardev.c1
-rw-r--r--fs/bcachefs/checksum.c101
-rw-r--r--fs/bcachefs/clock.h9
-rw-r--r--fs/bcachefs/darray.c4
-rw-r--r--fs/bcachefs/darray.h26
-rw-r--r--fs/bcachefs/data_update.c4
-rw-r--r--fs/bcachefs/dirent.c66
-rw-r--r--fs/bcachefs/disk_accounting.c82
-rw-r--r--fs/bcachefs/disk_accounting.h29
-rw-r--r--fs/bcachefs/disk_accounting_types.h2
-rw-r--r--fs/bcachefs/ec.c303
-rw-r--r--fs/bcachefs/ec.h11
-rw-r--r--fs/bcachefs/ec_format.h9
-rw-r--r--fs/bcachefs/ec_types.h1
-rw-r--r--fs/bcachefs/errcode.h14
-rw-r--r--fs/bcachefs/error.c14
-rw-r--r--fs/bcachefs/error.h2
-rw-r--r--fs/bcachefs/extents.c33
-rw-r--r--fs/bcachefs/extents.h24
-rw-r--r--fs/bcachefs/fs-common.c5
-rw-r--r--fs/bcachefs/fs-io-buffered.c41
-rw-r--r--fs/bcachefs/fs-io-direct.c2
-rw-r--r--fs/bcachefs/fs-io-pagecache.c90
-rw-r--r--fs/bcachefs/fs-io-pagecache.h4
-rw-r--r--fs/bcachefs/fs-io.c178
-rw-r--r--fs/bcachefs/fs-ioctl.c4
-rw-r--r--fs/bcachefs/fs.c427
-rw-r--r--fs/bcachefs/fs.h18
-rw-r--r--fs/bcachefs/fsck.c295
-rw-r--r--fs/bcachefs/inode.c14
-rw-r--r--fs/bcachefs/inode.h1
-rw-r--r--fs/bcachefs/io_read.c22
-rw-r--r--fs/bcachefs/io_write.c11
-rw-r--r--fs/bcachefs/journal_io.c8
-rw-r--r--fs/bcachefs/journal_reclaim.c7
-rw-r--r--fs/bcachefs/logged_ops.c13
-rw-r--r--fs/bcachefs/opts.c85
-rw-r--r--fs/bcachefs/opts.h61
-rw-r--r--fs/bcachefs/rcu_pending.c650
-rw-r--r--fs/bcachefs/rcu_pending.h27
-rw-r--r--fs/bcachefs/rebalance.c3
-rw-r--r--fs/bcachefs/recovery.c29
-rw-r--r--fs/bcachefs/recovery_passes.c10
-rw-r--r--fs/bcachefs/recovery_passes_types.h2
-rw-r--r--fs/bcachefs/reflink.c2
-rw-r--r--fs/bcachefs/replicas.c28
-rw-r--r--fs/bcachefs/replicas.h2
-rw-r--r--fs/bcachefs/replicas_format.h9
-rw-r--r--fs/bcachefs/sb-clean.c3
-rw-r--r--fs/bcachefs/sb-downgrade.c9
-rw-r--r--fs/bcachefs/sb-errors.c6
-rw-r--r--fs/bcachefs/sb-errors.h2
-rw-r--r--fs/bcachefs/sb-errors_format.h39
-rw-r--r--fs/bcachefs/sb-members.c57
-rw-r--r--fs/bcachefs/sb-members.h22
-rw-r--r--fs/bcachefs/six.c14
-rw-r--r--fs/bcachefs/snapshot.c3
-rw-r--r--fs/bcachefs/str_hash.h2
-rw-r--r--fs/bcachefs/subvolume.c54
-rw-r--r--fs/bcachefs/subvolume.h45
-rw-r--r--fs/bcachefs/subvolume_types.h3
-rw-r--r--fs/bcachefs/super-io.c19
-rw-r--r--fs/bcachefs/super.c85
-rw-r--r--fs/bcachefs/sysfs.c55
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/bcachefs/thread_with_file.c4
-rw-r--r--fs/bcachefs/time_stats.c14
-rw-r--r--fs/bcachefs/time_stats.h3
-rw-r--r--fs/bcachefs/trace.h465
-rw-r--r--fs/bcachefs/util.c16
-rw-r--r--fs/bcachefs/util.h2
-rw-r--r--fs/bcachefs/xattr.c81
-rw-r--r--fs/bcachefs/xattr_format.h2
-rw-r--r--fs/binfmt_elf.c48
-rw-r--r--fs/bpf_fs_kfuncs.c185
-rw-r--r--fs/btrfs/btrfs_inode.h1
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/defrag.c2
-rw-r--r--fs/btrfs/file.c34
-rw-r--r--fs/btrfs/ioctl.c4
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/cachefiles/namei.c7
-rw-r--r--fs/ceph/addr.c1
-rw-r--r--fs/ceph/caps.c29
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/mds_client.c25
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/super.c1
-rw-r--r--fs/ceph/super.h7
-rw-r--r--fs/coredump.c107
-rw-r--r--fs/debugfs/file.c1
-rw-r--r--fs/dlm/debug_fs.c1
-rw-r--r--fs/efivarfs/file.c1
-rw-r--r--fs/eventfd.c4
-rw-r--r--fs/eventpoll.c30
-rw-r--r--fs/exec.c81
-rw-r--r--fs/exfat/balloc.c10
-rw-r--r--fs/exfat/exfat_fs.h24
-rw-r--r--fs/exfat/file.c110
-rw-r--r--fs/exfat/inode.c94
-rw-r--r--fs/exfat/namei.c17
-rw-r--r--fs/exfat/nls.c5
-rw-r--r--fs/exfat/super.c41
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/dir.c14
-rw-r--r--fs/ext4/ext4.h31
-rw-r--r--fs/ext4/extents.c823
-rw-r--r--fs/ext4/extents_status.c240
-rw-r--r--fs/ext4/extents_status.h28
-rw-r--r--fs/ext4/fast_commit.c47
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/ialloc.c35
-rw-r--r--fs/ext4/indirect.c7
-rw-r--r--fs/ext4/inline.c46
-rw-r--r--fs/ext4/inode.c292
-rw-r--r--fs/ext4/ioctl.c6
-rw-r--r--fs/ext4/mballoc.c25
-rw-r--r--fs/ext4/migrate.c7
-rw-r--r--fs/ext4/move_extent.c90
-rw-r--r--fs/ext4/namei.c16
-rw-r--r--fs/ext4/readpage.c16
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c65
-rw-r--r--fs/ext4/xattr.c31
-rw-r--r--fs/ext4/xattr.h7
-rw-r--r--fs/f2fs/checkpoint.c17
-rw-r--r--fs/f2fs/compress.c63
-rw-r--r--fs/f2fs/data.c164
-rw-r--r--fs/f2fs/debug.c2
-rw-r--r--fs/f2fs/dir.c8
-rw-r--r--fs/f2fs/extent_cache.c4
-rw-r--r--fs/f2fs/f2fs.h148
-rw-r--r--fs/f2fs/file.c205
-rw-r--r--fs/f2fs/gc.c113
-rw-r--r--fs/f2fs/gc.h29
-rw-r--r--fs/f2fs/inline.c31
-rw-r--r--fs/f2fs/inode.c9
-rw-r--r--fs/f2fs/namei.c68
-rw-r--r--fs/f2fs/node.c46
-rw-r--r--fs/f2fs/segment.c72
-rw-r--r--fs/f2fs/segment.h5
-rw-r--r--fs/f2fs/super.c119
-rw-r--r--fs/f2fs/sysfs.c82
-rw-r--r--fs/f2fs/verity.c5
-rw-r--r--fs/f2fs/xattr.c14
-rw-r--r--fs/fcntl.c38
-rw-r--r--fs/fhandle.c4
-rw-r--r--fs/file.c26
-rw-r--r--fs/fsopen.c7
-rw-r--r--fs/fuse/Makefile3
-rw-r--r--fs/fuse/acl.c10
-rw-r--r--fs/fuse/control.c4
-rw-r--r--fs/fuse/dev.c221
-rw-r--r--fs/fuse/dir.c152
-rw-r--r--fs/fuse/file.c184
-rw-r--r--fs/fuse/fuse_i.h42
-rw-r--r--fs/fuse/fuse_trace.h132
-rw-r--r--fs/fuse/inode.c13
-rw-r--r--fs/fuse/passthrough.c7
-rw-r--r--fs/fuse/virtio_fs.c206
-rw-r--r--fs/gfs2/aops.c30
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c9
-rw-r--r--fs/gfs2/log.c12
-rw-r--r--fs/gfs2/meta_io.c24
-rw-r--r--fs/gfs2/ops_fstype.c3
-rw-r--r--fs/inode.c8
-rw-r--r--fs/ioctl.c30
-rw-r--r--fs/iomap/buffered-io.c199
-rw-r--r--fs/iomap/direct-io.c42
-rw-r--r--fs/isofs/rock.h2
-rw-r--r--fs/jbd2/checkpoint.c21
-rw-r--r--fs/jbd2/journal.c97
-rw-r--r--fs/kernel_read_file.c4
-rw-r--r--fs/lockd/host.c2
-rw-r--r--fs/lockd/svc.c9
-rw-r--r--fs/locks.c14
-rw-r--r--fs/mnt_idmapping.c22
-rw-r--r--fs/namei.c10
-rw-r--r--fs/namespace.c22
-rw-r--r--fs/netfs/buffered_write.c1
-rw-r--r--fs/netfs/internal.h1
-rw-r--r--fs/netfs/misc.c74
-rw-r--r--fs/netfs/write_issue.c24
-rw-r--r--fs/nfs/Kconfig1
-rw-r--r--fs/nfs/Makefile1
-rw-r--r--fs/nfs/callback.c2
-rw-r--r--fs/nfs/client.c21
-rw-r--r--fs/nfs/dir.c6
-rw-r--r--fs/nfs/filelayout/filelayout.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c56
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c6
-rw-r--r--fs/nfs/fs_context.c8
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/inode.c53
-rw-r--r--fs/nfs/internal.h54
-rw-r--r--fs/nfs/localio.c757
-rw-r--r--fs/nfs/nfs2xdr.c70
-rw-r--r--fs/nfs/nfs3xdr.c108
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4proc.c16
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/nfs4xdr.c101
-rw-r--r--fs/nfs/nfstrace.h61
-rw-r--r--fs/nfs/pagelist.c16
-rw-r--r--fs/nfs/pnfs_nfs.c2
-rw-r--r--fs/nfs/read.c3
-rw-r--r--fs/nfs/super.c3
-rw-r--r--fs/nfs/write.c21
-rw-r--r--fs/nfs_common/Makefile5
-rw-r--r--fs/nfs_common/common.c134
-rw-r--r--fs/nfs_common/nfslocalio.c172
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/auth.c14
-rw-r--r--fs/nfsd/auth.h2
-rw-r--r--fs/nfsd/blocklayout.c6
-rw-r--r--fs/nfsd/blocklayoutxdr.h2
-rw-r--r--fs/nfsd/cache.h2
-rw-r--r--fs/nfsd/export.c67
-rw-r--r--fs/nfsd/export.h7
-rw-r--r--fs/nfsd/filecache.c137
-rw-r--r--fs/nfsd/filecache.h6
-rw-r--r--fs/nfsd/localio.c169
-rw-r--r--fs/nfsd/netns.h19
-rw-r--r--fs/nfsd/nfs3proc.c44
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4idmap.c13
-rw-r--r--fs/nfsd/nfs4layouts.c1
-rw-r--r--fs/nfsd/nfs4proc.c69
-rw-r--r--fs/nfsd/nfs4recover.c13
-rw-r--r--fs/nfsd/nfs4state.c219
-rw-r--r--fs/nfsd/nfs4xdr.c29
-rw-r--r--fs/nfsd/nfsctl.c46
-rw-r--r--fs/nfsd/nfsd.h50
-rw-r--r--fs/nfsd/nfsfh.c185
-rw-r--r--fs/nfsd/nfsfh.h4
-rw-r--r--fs/nfsd/nfsproc.c49
-rw-r--r--fs/nfsd/nfssvc.c231
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/trace.h145
-rw-r--r--fs/nfsd/vfs.c45
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/bmap.h20
-rw-r--r--fs/nilfs2/btnode.c63
-rw-r--r--fs/nilfs2/btree.c12
-rw-r--r--fs/nilfs2/btree.h1
-rw-r--r--fs/nilfs2/cpfile.c54
-rw-r--r--fs/nilfs2/dat.c17
-rw-r--r--fs/nilfs2/dir.c44
-rw-r--r--fs/nilfs2/inode.c79
-rw-r--r--fs/nilfs2/ioctl.c109
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/nilfs.h27
-rw-r--r--fs/nilfs2/page.c21
-rw-r--r--fs/nilfs2/page.h4
-rw-r--r--fs/nilfs2/recovery.c11
-rw-r--r--fs/nilfs2/segment.c234
-rw-r--r--fs/nilfs2/segment.h10
-rw-r--r--fs/nilfs2/sufile.c52
-rw-r--r--fs/nilfs2/super.c9
-rw-r--r--fs/nilfs2/the_nilfs.c5
-rw-r--r--fs/nilfs2/the_nilfs.h6
-rw-r--r--fs/notify/fanotify/fanotify_user.c12
-rw-r--r--fs/notify/inotify/inotify_user.c12
-rw-r--r--fs/nsfs.c1
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c6
-rw-r--r--fs/ocfs2/dir.c12
-rw-r--r--fs/ocfs2/dlmglue.c7
-rw-r--r--fs/ocfs2/extent_map.c8
-rw-r--r--fs/ocfs2/journal.c7
-rw-r--r--fs/ocfs2/localalloc.c19
-rw-r--r--fs/ocfs2/quota_global.c15
-rw-r--r--fs/ocfs2/quota_local.c8
-rw-r--r--fs/ocfs2/refcounttree.c39
-rw-r--r--fs/ocfs2/super.c10
-rw-r--r--fs/ocfs2/xattr.c26
-rw-r--r--fs/open.c24
-rw-r--r--fs/orangefs/orangefs-sysfs.c14
-rw-r--r--fs/overlayfs/file.c68
-rw-r--r--fs/pidfs.c5
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/inode.c31
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/page.c11
-rw-r--r--fs/proc/proc_sysctl.c11
-rw-r--r--fs/proc/task_mmu.c15
-rw-r--r--fs/pstore/platform.c10
-rw-r--r--fs/quota/dquot.c14
-rw-r--r--fs/quota/quota.c8
-rw-r--r--fs/quota/quota_v1.c3
-rw-r--r--fs/quota/quota_v2.c9
-rw-r--r--fs/read_write.c118
-rw-r--r--fs/readdir.c20
-rw-r--r--fs/remap_range.c2
-rw-r--r--fs/select.c12
-rw-r--r--fs/signalfd.c6
-rw-r--r--fs/smb/client/cifsencrypt.c151
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h5
-rw-r--r--fs/smb/client/cifsproto.h12
-rw-r--r--fs/smb/client/connect.c66
-rw-r--r--fs/smb/client/dfs.c73
-rw-r--r--fs/smb/client/dfs.h42
-rw-r--r--fs/smb/client/dfs_cache.c218
-rw-r--r--fs/smb/client/fs_context.h1
-rw-r--r--fs/smb/client/inode.c19
-rw-r--r--fs/smb/client/ioctl.c8
-rw-r--r--fs/smb/client/misc.c6
-rw-r--r--fs/smb/client/namespace.c2
-rw-r--r--fs/smb/client/reparse.c10
-rw-r--r--fs/smb/client/reparse.h9
-rw-r--r--fs/smb/client/sess.c2
-rw-r--r--fs/smb/client/smb2misc.c28
-rw-r--r--fs/smb/client/smb2ops.c56
-rw-r--r--fs/smb/client/smb2pdu.c32
-rw-r--r--fs/smb/client/smb2proto.h2
-rw-r--r--fs/smb/client/smb2transport.c32
-rw-r--r--fs/smb/client/trace.h6
-rw-r--r--fs/smb/client/transport.c3
-rw-r--r--fs/smb/common/smb2pdu.h6
-rw-r--r--fs/smb/server/connection.c2
-rw-r--r--fs/smb/server/ksmbd_netlink.h2
-rw-r--r--fs/smb/server/oplock.c4
-rw-r--r--fs/smb/server/server.c2
-rw-r--r--fs/smb/server/smb2pdu.c35
-rw-r--r--fs/smb/server/smb2pdu.h4
-rw-r--r--fs/smb/server/smb_common.c2
-rw-r--r--fs/smb/server/vfs_cache.h4
-rw-r--r--fs/smb/server/xattr.h2
-rw-r--r--fs/splice.c22
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c6
-rw-r--r--fs/stat.c8
-rw-r--r--fs/statfs.c4
-rw-r--r--fs/sync.c14
-rw-r--r--fs/timerfd.c8
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/userfaultfd.c171
-rw-r--r--fs/utimes.c4
-rw-r--r--fs/xattr.c36
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c15
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c5
-rw-r--r--fs/xfs/libxfs/xfs_shared.h3
-rw-r--r--fs/xfs/scrub/xfile.c6
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_exchrange.c8
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_handle.c6
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_ioctl.c28
-rw-r--r--fs/xfs/xfs_iomap.c19
-rw-r--r--fs/xfs/xfs_iops.c12
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c8
-rw-r--r--fs/xfs/xfs_super.c28
-rw-r--r--fs/zonefs/file.c2
-rw-r--r--fs/zonefs/sysfs.c1
407 files changed, 10641 insertions, 6444 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index a46b0cbc4d8f..949895cff872 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -288,6 +288,10 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
+config HUGETLB_PMD_PAGE_TABLE_SHARING
+ def_bool HUGETLB_PAGE
+ depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS
+
config ARCH_HAS_GIGANTIC_PAGE
bool
@@ -382,6 +386,29 @@ config NFS_COMMON
depends on NFSD || NFS_FS || LOCKD
default y
+config NFS_COMMON_LOCALIO_SUPPORT
+ tristate
+ default n
+ default y if NFSD=y || NFS_FS=y
+ default m if NFSD=m && NFS_FS=m
+ select SUNRPC
+
+config NFS_LOCALIO
+ bool "NFS client and server support for LOCALIO auxiliary protocol"
+ depends on NFSD && NFS_FS
+ select NFS_COMMON_LOCALIO_SUPPORT
+ default n
+ help
+ Some NFS servers support an auxiliary NFS LOCALIO protocol
+ that is not an official part of the NFS protocol.
+
+ This option enables support for the LOCALIO protocol in the
+ kernel's NFS server and client. Enable this to permit local
+ NFS clients to bypass the network when issuing reads and
+ writes to the local NFS server.
+
+ If unsure, say N.
+
config NFS_V4_2_SSC_HELPER
bool
default y if NFS_V4_2
diff --git a/fs/Makefile b/fs/Makefile
index 6ecc9b0a53f2..61679fd587b7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -129,3 +129,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/
obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
+obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index 9c65ffb8a523..a06296c8827d 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -134,13 +134,4 @@ struct afs_uvldbentry__xdr {
__be32 spares9;
};
-struct afs_address_list {
- refcount_t usage;
- unsigned int version;
- unsigned int nr_addrs;
- struct sockaddr_rxrpc addrs[];
-};
-
-extern void afs_put_address_list(struct afs_address_list *alist);
-
#endif /* AFS_VL_H */
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 492d857a3fa0..6762eff97517 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -420,6 +420,7 @@ const struct netfs_request_ops afs_req_ops = {
.begin_writeback = afs_begin_writeback,
.prepare_write = afs_prepare_write,
.issue_write = afs_issue_write,
+ .retry_request = afs_retry_request,
};
static void afs_add_open_mmap(struct afs_vnode *vnode)
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 3546b087e791..428721bbe4f6 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -201,7 +201,7 @@ void afs_wait_for_operation(struct afs_operation *op)
}
}
- if (op->call_responded)
+ if (op->call_responded && op->server)
set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
if (!afs_op_error(op)) {
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 580de4adaaf6..b516d05b0fef 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -506,10 +506,10 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (estate->responsive_set & ~exclude)
- return 1;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
+ if (estate->responsive_set & ~exclude)
+ return 1;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
if (timo == 0)
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ed09d4d4c211..d612983d6f38 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -632,8 +632,10 @@ iterate_address:
wait_for_more_probe_results:
error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
!(op->flags & AFS_OPERATION_UNINTR));
- if (!error)
+ if (error == 1)
goto iterate_address;
+ if (!error)
+ goto restart_from_beginning;
/* We've now had a failure to respond on all of a server's addresses -
* immediately probe them again and consider retrying the server.
@@ -644,10 +646,13 @@ wait_for_more_probe_results:
error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
!(op->flags & AFS_OPERATION_UNINTR));
switch (error) {
- case 0:
+ case 1:
op->flags &= ~AFS_OPERATION_RETRY_SERVER;
- trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 1);
goto retry_server;
+ case 0:
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+ goto restart_from_beginning;
case -ERESTARTSYS:
afs_op_set_error(op, error);
goto failed;
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 5cdfef3b551a..5bac803ea367 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
is held by another thread, spin for a short while, as long as the
thread owning the lock is running.
+config BCACHEFS_PATH_TRACEPOINTS
+ bool "Extra btree_path tracepoints"
+ depends on BCACHEFS_FS
+ help
+ Enable extra tracepoints for debugging btree_path operations; we don't
+ normally want these enabled because they happen at very high rates.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 0ab533a2b03b..56d20e219f59 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -69,6 +69,7 @@ bcachefs-y := \
printbuf.o \
quota.o \
rebalance.o \
+ rcu_pending.o \
recovery.o \
recovery_passes.o \
reflink.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 331a17f3f113..87f1be9d4db4 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -361,7 +361,7 @@ retry:
bch2_trans_begin(trans);
acl = _acl;
- ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_intent);
if (ret)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index dc3a4024aab6..645b5ed4babb 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
+#include <linux/jiffies.h>
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
@@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
- if (last_updated + HZ * 10 < jiffies) {
+ if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;
@@ -2297,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c)
return 0;
}
+/* device removal */
+
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+ struct bpos start = POS(ca->dev_idx, 0);
+ struct bpos end = POS(ca->dev_idx, U64_MAX);
+ int ret;
+
+ /*
+ * We clear the LRU and need_discard btrees first so that we don't race
+ * with bch2_do_invalidates() and bch2_do_discards()
+ */
+ ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
+ bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
+ bch_err_msg(ca, ret, "removing dev alloc info");
+ return ret;
+}
+
/* Bucket IO clocks: */
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
@@ -2432,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ c->rw_devs_change_count++;
+
/*
* Capacity is calculated based off of devices in allocation groups:
*/
@@ -2467,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned i;
+ lockdep_assert_held(&c->state_lock);
- for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
if (ca->mi.data_allowed & (1 << i))
set_bit(ca->dev_idx, c->rw_devs[i].d);
+
+ c->rw_devs_change_count++;
}
void bch2_dev_allocator_background_exit(struct bch_dev *ca)
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index fd790b03fbe1..f8e87c6721b1 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -16,7 +16,7 @@ enum bch_validate_flags;
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, pos.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
bool ret = ca && bucket_valid(ca, pos.offset);
rcu_read_unlock();
return ret;
@@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
+int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 8563c2d26847..d0e0b56892e3 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
enum bch_watermark watermark,
enum bch_data_type data_type,
struct closure *cl,
+ bool nowait,
struct bch_dev_usage *usage)
{
struct bch_fs *c = trans->c;
@@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bucket_alloc_state s = {
.btree_bitmap = data_type == BCH_DATA_btree,
};
- bool waiting = false;
+ bool waiting = nowait;
again:
bch2_dev_usage_read_fast(ca, usage);
avail = dev_buckets_free(ca, *usage, watermark);
@@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, &usage)));
+ data_type, cl, false, &usage)));
return ob;
}
@@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
@@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- unsigned flags,
+ enum bch_write_flags flags,
enum bch_data_type data_type,
enum bch_watermark watermark,
struct closure *cl)
@@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage);
+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
+ cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob)) {
+ have_cache, ob)) {
ret = 0;
break;
}
@@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
@@ -883,7 +884,7 @@ got_bucket:
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
out_put_head:
bch2_ec_stripe_head_put(c, h);
return ret;
@@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
- bool ec, unsigned flags)
+ bool ec)
{
struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
@@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c,
have_cache, ec, ob))
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
else
ob_push(c, &ptrs_skip, ob);
}
@@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache, bool ec,
- enum bch_watermark watermark,
- unsigned flags)
+ enum bch_watermark watermark)
{
int i, ret = 0;
@@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, flags, ob);
+ have_cache, ob);
if (ret)
break;
}
@@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
@@ -1022,18 +1022,15 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
- if (erasure_code && ec_open_bucket(c, ptrs))
- return 0;
-
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, flags);
+ have_cache, erasure_code);
if (ret)
return ret;
ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
- have_cache, erasure_code, watermark, flags);
+ have_cache, erasure_code, watermark);
if (ret)
return ret;
@@ -1074,12 +1071,12 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
unsigned *nr_effective,
bool *have_cache,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl)
{
int ret;
- if (erasure_code) {
+ if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
ret = __open_bucket_add_buckets(trans, ptrs, wp,
devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,
@@ -1376,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum bch_watermark watermark,
- unsigned flags,
+ enum bch_write_flags flags,
struct closure *cl,
struct write_point **wp_ret)
{
@@ -1392,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
-
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@@ -1498,11 +1493,12 @@ err:
try_decrease_writepoints(trans, write_points_nr))
goto retry;
- if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
+ if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ ret = -BCH_ERR_bucket_alloc_blocked;
+
+ if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- return cl
- ? -BCH_ERR_bucket_alloc_blocked
- : -BCH_ERR_ENOSPC_bucket_alloc;
+ ret = -BCH_ERR_bucket_alloc_blocked;
return ret;
}
@@ -1733,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
- printbuf_tabstops_reset(out);
- printbuf_tabstop_push(out, 12);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
- printbuf_tabstop_push(out, 16);
-
bch2_dev_usage_to_text(out, ca, &stats);
prt_newline(out);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 386d231ceca3..1a16fd5bd4f8 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
return ret;
}
+enum bch_write_flags;
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, unsigned,
+ unsigned, unsigned *, bool *, enum bch_write_flags,
enum bch_data_type, enum bch_watermark,
struct closure *);
@@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct bch_devs_list *,
unsigned, unsigned,
enum bch_watermark,
- unsigned,
+ enum bch_write_flags,
struct closure *,
struct write_point **);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index d4da6343efa9..47455a85c909 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -9,6 +9,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "checksum.h"
+#include "disk_accounting.h"
#include "error.h"
#include <linux/mm.h>
@@ -53,7 +54,7 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
if (!ca) {
/* these will be caught by fsck */
rcu_read_unlock();
@@ -87,7 +88,7 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
if (ca) {
struct bpos bucket = bp_pos_to_bucket(ca, k.k->p);
rcu_read_unlock();
@@ -500,7 +501,7 @@ found:
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
bch2_bkey_val_to_text(&buf, c, extent2);
- struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
trans, dup_backpointer_to_bad_csum_extent,
@@ -671,7 +672,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
continue;
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
if (ca)
bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp);
rcu_read_unlock();
@@ -750,10 +751,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
+ bch2_btree_cache_unpin(c);
+
btree_interior_mask |= btree_leaf_mask;
- c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
- c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
+ c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
+ c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@@ -775,6 +778,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
+ bch2_node_pin(c, b);
0;
}));
}
@@ -782,12 +786,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+static inline void progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = i,
+ };
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+static void progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
+ struct progress_indicator_state progress;
int ret = 0;
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
btree_id++) {
@@ -805,6 +877,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -865,8 +938,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@@ -920,19 +992,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
+ struct bch_fs *c = trans->c;
struct bkey_buf last_flushed;
+ struct progress_indicator_state progress;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
+ progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed));
-
- bch2_bkey_buf_exit(&last_flushed, trans->c);
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ check_one_backpointer(trans, start, end,
+ bkey_s_c_to_backpointer(k),
+ &last_flushed);
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
return ret;
}
@@ -977,8 +1054,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- c->btree_cache.pinned_nodes_leaf_mask = 0;
- c->btree_cache.pinned_nodes_interior_mask = 0;
+ bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 7daecadb764e..3b29fdf519dd 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -134,28 +134,37 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
}
}
-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
- struct bpos *bucket_pos, struct bch_backpointer *bp)
+ struct bpos *bucket_pos, struct bch_backpointer *bp,
+ u64 sectors)
{
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- s64 sectors = level ? btree_sectors(c) : k.k->size;
u32 bucket_offset;
-
*bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
- .data_type = data_type,
+ .data_type = bch2_bkey_ptr_data_type(k, p, entry),
.bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
p.crc.offset,
- .bucket_len = ptr_disk_sectors(sectors, p),
+ .bucket_len = sectors,
.pos = k.k->p,
};
}
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+ u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+
+ __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors);
+}
+
int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 0c7086e00d18..f4151ee51b03 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -542,7 +542,7 @@ struct bch_dev {
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
- struct bucket_array __rcu *buckets_gc;
+ GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
@@ -594,6 +594,7 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \
+ x(clean_recovery) \
x(btree_running) \
x(accounting_replay_done) \
x(may_go_rw) \
@@ -776,7 +777,7 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
@@ -871,6 +872,7 @@ struct bch_fs {
/* ALLOCATION */
struct bch_devs_mask rw_devs[BCH_DATA_NR];
+ unsigned long rw_devs_change_count;
u64 capacity; /* sectors */
u64 reserved; /* sectors */
@@ -1023,6 +1025,7 @@ struct bch_fs {
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
+ struct rhashtable vfs_inodes_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@@ -1044,8 +1047,6 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
- /* bitmap of explicitly enabled recovery passes: */
- u64 recovery_passes_explicit;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
@@ -1085,7 +1086,6 @@ struct bch_fs {
u64 __percpu *counters;
unsigned copy_gc_enabled:1;
- bool promote_whole_extents;
struct bch2_time_stats times[BCH_TIME_STAT_NR];
@@ -1195,12 +1195,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
+ s64 sec;
s32 rem;
time += c->sb.time_base_lo;
- t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
- t.tv_nsec = rem * c->sb.nsec_per_time_unit;
+ sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
+
+ set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit);
+
return t;
}
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 14ce726bf5a3..84832c2d4df9 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -217,13 +217,13 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
- struct bversion version;
+ struct bversion bversion;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ struct bversion bversion;
__u8 pad[1];
#endif
@@ -328,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, version.hi), \
- bkey_format_field(VERSION_LO, version.lo), \
+ bkey_format_field(VERSION_HI, bversion.hi), \
+ bkey_format_field(VERSION_LO, bversion.lo), \
}, \
})
@@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+ struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index e34cb2bf329c..41df24a53d97 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-static __always_inline int bversion_zero(struct bversion v)
+static __always_inline bool bversion_zero(struct bversion v)
{
- return !bversion_cmp(v, ZERO_VERSION);
+ return bversion_cmp(v, ZERO_VERSION) == 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {}
x(BKEY_FIELD_OFFSET, p.offset) \
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
+ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
+ x(BKEY_FIELD_VERSION_LO, bversion.lo)
struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS];
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 88d8958281e8..e7ac227ba7e8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
bch2_bpos_to_text(out, k->p);
- prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
} else {
prt_printf(out, "(null)");
}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3df3dd2723a1..018fb72e32d3 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
return l->type == r->type &&
- !bversion_cmp(l->version, r->version) &&
+ !bversion_cmp(l->bversion, r->bversion) &&
bpos_eq(l->p, bkey_start_pos(r));
}
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 575e1d0b6eeb..d1f6092624d8 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -304,11 +304,6 @@ struct bkey_float {
};
#define BKEY_MANTISSA_BITS 16
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
- return idx * sizeof(struct bkey_float);
-}
-
struct ro_aux_tree {
u8 nothing[0];
struct bkey_float f[];
@@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
- DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
- t->size * sizeof(u8), 8);
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
return __aux_tree_base(b, t);
}
-static u8 *ro_aux_tree_prev(const struct btree *b,
- const struct bset_tree *t)
-{
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
@@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
bkey_float(b, t, j)->key_offset);
}
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
- const struct bset_tree *t,
- unsigned j)
-{
- unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
- return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
const struct bset_tree *t)
{
@@ -585,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
}
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
u64 v;
@@ -617,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
struct bkey_packed *m = tree_to_bkey(b, t, j);
struct bkey_packed *l = is_power_of_2(j)
? min_key
- : tree_to_prev_bkey(b, t, j >> ffs(j));
+ : tree_to_bkey(b, t, j >> ffs(j));
struct bkey_packed *r = is_power_of_2(j + 1)
? max_key
: tree_to_bkey(b, t, j >> (ffz(j) + 1));
@@ -668,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
- mantissa = bkey_mantissa(m, f, j);
+ mantissa = bkey_mantissa(m, f);
/*
* If we've got garbage bits, set them to all 1s - it's legal for the
@@ -690,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
- return __bset_tree_capacity(b, t) /
- (sizeof(struct bkey_float) + sizeof(u8));
+ return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
}
static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
@@ -720,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
- struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+ struct bkey_packed *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
unsigned cacheline = 1;
@@ -733,12 +708,12 @@ retry:
return;
}
- t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+ t->extra = eytzinger1_extra(t->size - 1);
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_p_next(k);
+ k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@@ -746,17 +721,12 @@ retry:
goto retry;
}
- ro_aux_tree_prev(b, t)[j] = prev->u64s;
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
- EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
- while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_p_next(k);
-
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
min_key.k.p = b->data->min_key;
@@ -915,6 +885,38 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
+static void rw_aux_tree_insert_entry(struct btree *b,
+ struct bset_tree *t,
+ unsigned idx)
+{
+ EBUG_ON(!idx || idx > t->size);
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+ struct bkey_packed *end = idx < t->size
+ ? rw_aux_to_bkey(b, t, idx)
+ : btree_bkey_last(b, t);
+
+ if (t->size < bset_rw_tree_capacity(b, t) &&
+ (void *) end - (void *) start > L1_CACHE_BYTES) {
+ struct bkey_packed *k = start;
+
+ while (1) {
+ k = bkey_p_next(k);
+ if (k == end)
+ break;
+
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
+ memmove(&rw_aux_tree(b, t)[idx + 1],
+ &rw_aux_tree(b, t)[idx],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx]);
+ t->size++;
+ rw_aux_tree_set(b, t, idx, k);
+ break;
+ }
+ }
+ }
+}
+
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
@@ -922,84 +924,59 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
- unsigned l, j, where = __btree_node_key_to_offset(b, _where);
+ unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
+ if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+ rw_aux_tree_insert_entry(b, t, t->size);
+ goto verify;
+ }
+
/* returns first entry >= where */
- l = rw_aux_tree_bsearch(b, t, where);
-
- if (!l) /* never delete first entry */
- l++;
- else if (l < t->size &&
- where < t->end_offset &&
- rw_aux_tree(b, t)[l].offset == where)
- rw_aux_tree_set(b, t, l++, _where);
-
- /* l now > where */
-
- for (j = l;
- j < t->size &&
- rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
- j++)
- ;
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset + shift ==
- rw_aux_tree(b, t)[l - 1].offset)
- j++;
-
- memmove(&rw_aux_tree(b, t)[l],
- &rw_aux_tree(b, t)[j],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[j]);
- t->size -= j - l;
-
- for (j = l; j < t->size; j++)
- rw_aux_tree(b, t)[j].offset += shift;
+ idx = rw_aux_tree_bsearch(b, t, where);
+
+ if (rw_aux_tree(b, t)[idx].offset == where) {
+ if (!idx) { /* never delete first entry */
+ idx++;
+ } else if (where < t->end_offset) {
+ rw_aux_tree_set(b, t, idx++, _where);
+ } else {
+ EBUG_ON(where != t->end_offset);
+ rw_aux_tree_insert_entry(b, t, --t->size);
+ goto verify;
+ }
+ }
- EBUG_ON(l < t->size &&
- rw_aux_tree(b, t)[l].offset ==
- rw_aux_tree(b, t)[l - 1].offset);
+ EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+ if (idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset + shift ==
+ rw_aux_tree(b, t)[idx - 1].offset) {
+ memmove(&rw_aux_tree(b, t)[idx],
+ &rw_aux_tree(b, t)[idx + 1],
+ (void *) &rw_aux_tree(b, t)[t->size] -
+ (void *) &rw_aux_tree(b, t)[idx + 1]);
+ t->size -= 1;
+ }
- if (t->size < bset_rw_tree_capacity(b, t) &&
- (l < t->size
- ? rw_aux_tree(b, t)[l].offset
- : t->end_offset) -
- rw_aux_tree(b, t)[l - 1].offset >
- L1_CACHE_BYTES / sizeof(u64)) {
- struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
- struct bkey_packed *end = l < t->size
- ? rw_aux_to_bkey(b, t, l)
- : btree_bkey_last(b, t);
- struct bkey_packed *k = start;
+ for (j = idx; j < t->size; j++)
+ rw_aux_tree(b, t)[j].offset += shift;
- while (1) {
- k = bkey_p_next(k);
- if (k == end)
- break;
+ EBUG_ON(idx < t->size &&
+ rw_aux_tree(b, t)[idx].offset ==
+ rw_aux_tree(b, t)[idx - 1].offset);
- if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
- memmove(&rw_aux_tree(b, t)[l + 1],
- &rw_aux_tree(b, t)[l],
- (void *) &rw_aux_tree(b, t)[t->size] -
- (void *) &rw_aux_tree(b, t)[l]);
- t->size++;
- rw_aux_tree_set(b, t, l, k);
- break;
- }
- }
- }
+ rw_aux_tree_insert_entry(b, t, idx);
+verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
void bch2_bset_insert(struct btree *b,
- struct btree_node_iter *iter,
struct bkey_packed *where,
struct bkey_i *insert,
unsigned clobber_u64s)
@@ -1098,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p)
}
static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
- const struct bkey_float *f,
- unsigned idx)
+ const struct bkey_float *f)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
@@ -1133,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
goto slowpath;
l = f->mantissa;
- r = bkey_mantissa(packed_search, f, n);
+ r = bkey_mantissa(packed_search, f);
- if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
goto slowpath;
n = n * 2 + (l < r);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 5c6c7a14fa0f..6953d55b72cc 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
- struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
+ unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index e52a06d3418c..6e4afb2b5441 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,11 +15,12 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/swap.h>
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do { \
if (shrinker_counter) \
- bc->not_freed_##counter++; \
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
} while (0)
const char * const bch2_btree_node_flags[] = {
@@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = {
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
- unsigned i, reserve = 16;
+ unsigned reserve = 16;
if (!c->btree_roots_known[0].b)
reserve += 8;
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
}
- c->btree_cache.reserve = reserve;
+ c->btree_cache.nr_reserve = reserve;
}
-static inline unsigned btree_cache_can_free(struct btree_cache *bc)
+static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
- return max_t(int, 0, bc->used - bc->reserve);
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+
+ size_t can_free = list->nr;
+ if (!list->idx)
+ can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
+ return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
+ BUG_ON(btree_node_hashed(b));
+
+ /*
+ * This should really be done in slub/vmalloc, but we're using the
+ * kmalloc_large() path, so we're working around a slub bug by doing
+ * this here:
+ */
+ if (b->data)
+ mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
+ if (b->aux_data)
+ mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
+
EBUG_ON(btree_node_write_in_flight(b));
clear_btree_node_just_written(b);
@@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
- bc->used--;
+ bc->nr_freeable--;
btree_node_to_freedlist(bc, b);
}
@@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
@@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
bch2_btree_lock_init(&b->c, 0);
- bc->used++;
+ bc->nr_freeable++;
list_add(&b->list, &bc->freeable);
return b;
}
@@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
+static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
+{
+ struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+ u64 mask = bc->pinned_nodes_mask[!!b->c.level];
+
+ return ((mask & BIT_ULL(b->c.btree_id)) &&
+ bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+ bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
+}
+
+void bch2_node_pin(struct bch_fs *c, struct btree *b)
+{
+ struct btree_cache *bc = &c->btree_cache;
+
+ mutex_lock(&bc->lock);
+ BUG_ON(!__btree_node_pinned(bc, b));
+ if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
+ set_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[1].list);
+ bc->live[0].nr--;
+ bc->live[1].nr++;
+ }
+ mutex_unlock(&bc->lock);
+}
+
+void bch2_btree_cache_unpin(struct bch_fs *c)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b, *n;
+
+ mutex_lock(&bc->lock);
+ c->btree_cache.pinned_nodes_mask[0] = 0;
+ c->btree_cache.pinned_nodes_mask[1] = 0;
+
+ list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
+ clear_btree_node_pinned(b);
+ list_move(&b->list, &bc->live[0].list);
+ bc->live[0].nr++;
+ bc->live[1].nr--;
+ }
+
+ mutex_unlock(&bc->lock);
+}
+
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
+ lockdep_assert_held(&bc->lock);
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
@@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
b->hash_val = 0;
if (b->c.btree_id < BTREE_ID_NR)
- --bc->used_by_btree[b->c.btree_id];
+ --bc->nr_by_btree[b->c.btree_id];
+
+ bc->live[btree_node_pinned(b)].nr--;
+ bc->nr_freeable++;
+ list_move(&b->list, &bc->freeable);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
@@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
- if (!ret && b->c.btree_id < BTREE_ID_NR)
- bc->used_by_btree[b->c.btree_id]++;
- return ret;
+ if (ret)
+ return ret;
+
+ if (b->c.btree_id < BTREE_ID_NR)
+ bc->nr_by_btree[b->c.btree_id]++;
+
+ bool p = __btree_node_pinned(bc, b);
+ mod_bit(BTREE_NODE_pinned, &b->flags, p);
+
+ list_move_tail(&b->list, &bc->live[p].list);
+ bc->live[p].nr++;
+
+ bc->nr_freeable--;
+ return 0;
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
- int ret;
-
b->c.level = level;
b->c.btree_id = id;
mutex_lock(&bc->lock);
- ret = __bch2_btree_node_hash_insert(bc, b);
- if (!ret)
- list_add_tail(&b->list, &bc->live);
+ int ret = __bch2_btree_node_hash_insert(bc, b);
mutex_unlock(&bc->lock);
return ret;
@@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
-
- struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
-
- u64 mask = b->c.level
- ? bc->pinned_nodes_interior_mask
- : bc->pinned_nodes_leaf_mask;
-
- if ((mask & BIT_ULL(b->c.btree_id)) &&
- bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
- bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
-
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
+ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
- bool trigger_writes = atomic_read(&bc->dirty) + nr >=
- bc->used * 3 / 4;
+ bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
- can_free = btree_cache_can_free(bc);
+ can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
- bc->freed++;
+ bc->nr_freed++;
}
}
restart:
- list_for_each_entry_safe(b, t, &bc->live, list) {
+ list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- bc->not_freed_access_bit++;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
+ --touched;;
} else if (!btree_node_reclaim(c, b, true)) {
+ bch2_btree_node_hash_remove(bc, b);
+
freed++;
btree_node_data_free(c, b);
- bc->freed++;
+ bc->nr_freed++;
- bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -450,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
- list_move(&bc->live, &b->list);
+ list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@@ -464,8 +531,8 @@ restart:
break;
}
out_rotate:
- if (&t->list != &bc->live)
- list_move_tail(&bc->live, &t->list);
+ if (&t->list != &list->list)
+ list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@@ -478,44 +545,45 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = shrink->private_data;
- struct btree_cache *bc = &c->btree_cache;
+ struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
- return btree_cache_can_free(bc);
+ return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
- unsigned i, flags;
+ struct btree *b, *t;
+ unsigned long flags;
- shrinker_free(bc->shrink);
+ shrinker_free(bc->live[1].shrink);
+ shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
- list_move(&c->verify_data->list, &bc->live);
+ list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
- for (i = 0; i < btree_id_nr_alive(c); i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
- list_add(&r->b->list, &bc->live);
+ list_add(&r->b->list, &bc->live[0].list);
}
- list_splice(&bc->freeable, &bc->live);
-
- while (!list_empty(&bc->live)) {
- b = list_first_entry(&bc->live, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->live[1].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->live[0].list, list)
+ bch2_btree_node_hash_remove(bc, b);
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
@@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
}
BUG_ON(!bch2_journal_error(&c->journal) &&
- atomic_read(&c->btree_cache.dirty));
+ atomic_long_read(&c->btree_cache.nr_dirty));
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
- while (!list_empty(&bc->freed_nonpcpu)) {
- b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
+ list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
list_del(&b->list);
six_lock_exit(&b->c.lock);
kfree(b);
@@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ BUG_ON(bc->nr_by_btree[i]);
+ BUG_ON(bc->live[0].nr);
+ BUG_ON(bc->live[1].nr);
+ BUG_ON(bc->nr_freeable);
+
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
@@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->reserve; i++)
+ for (i = 0; i < bc->nr_reserve; i++)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
- list_splice_init(&bc->live, &bc->freeable);
+ list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
- bc->shrink = shrink;
+ bc->live[0].shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->seeks = 2;
+ shrink->private_data = &bc->live[0];
+ shrinker_register(shrink);
+
+ shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
+ if (!shrink)
+ goto err;
+ bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
- shrink->seeks = 4;
- shrink->private_data = c;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@@ -582,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
- INIT_LIST_HEAD(&bc->live);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
+ bc->live[i].idx = i;
+ INIT_LIST_HEAD(&bc->live[i].list);
+ }
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b, false))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_reclaim(c, b, false))
+ return b;
while (1) {
- list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_write_and_reclaim(c, b))
- return b;
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+ if (!btree_node_write_and_reclaim(c, b))
+ return b;
/*
* Rare case: all nodes were intent-locked.
@@ -671,9 +759,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
- unsigned flags;
- flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@@ -725,7 +811,7 @@ got_node:
}
mutex_lock(&bc->lock);
- bc->used++;
+ bc->nr_freeable++;
got_mem:
mutex_unlock(&bc->lock);
@@ -745,8 +831,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
- memalloc_nofs_restore(flags);
-
int ret = bch2_trans_relock(trans);
if (unlikely(ret)) {
bch2_btree_node_to_freelist(c, b);
@@ -781,7 +865,6 @@ err:
}
mutex_unlock(&bc->lock);
- memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
@@ -1269,8 +1352,8 @@ wait_on_io:
BUG_ON(btree_node_dirty(b));
mutex_lock(&bc->lock);
- btree_node_data_free(c, b);
bch2_btree_node_hash_remove(bc, b);
+ btree_node_data_free(c, b);
mutex_unlock(&bc->lock);
out:
six_unlock_write(&b->c.lock);
@@ -1342,13 +1425,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
}
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
- const char *label, unsigned nr)
+ const char *label, size_t nr)
{
prt_printf(out, "%s\t", label);
prt_human_readable_u64(out, nr * c->opts.btree_node_size);
- prt_printf(out, " (%u)\n", nr);
+ prt_printf(out, " (%zu)\n", nr);
}
+static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
+#define x(n) #n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ NULL
+};
+
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
@@ -1356,24 +1446,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
- prt_btree_cache_line(out, c, "total:", bc->used);
- prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty));
+ prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
+ prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
+ prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);
- for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++)
- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]);
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
+ prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
prt_newline(out);
- prt_printf(out, "freed:\t%u\n", bc->freed);
+ prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
- prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty);
- prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight);
- prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight);
- prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent);
- prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write);
- prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit);
- prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict);
- prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked);
- prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
+ prt_printf(out, " %s\t%llu\n",
+ bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index f82064007127..367acd217c6a 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
+void bch2_node_pin(struct bch_fs *, struct btree *);
+void bch2_btree_cache_unpin(struct bch_fs *);
+
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index eb3002c4eae7..660d2fa02da2 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c)
struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
+ bch2_trans_srcu_unlock(trans);
+
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
@@ -549,9 +551,8 @@ reconstruct_root:
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
r->b = NULL;
@@ -600,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k.k->version.lo > atomic64_read(&c->journal.seq));
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->version.lo > atomic64_read(&c->key_version),
+ k.k->bversion.lo > atomic64_read(&c->key_version),
trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->version.lo);
+ atomic64_set(&c->key_version, k.k->bversion.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
@@ -753,10 +754,8 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(c, ca) {
- kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
- ca->buckets_gc = NULL;
- }
+ for_each_member_device(c, ca)
+ genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
@@ -910,20 +909,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
int ret = 0;
for_each_member_device(c, ca) {
- struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!buckets) {
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
break;
}
-
- buckets->first_bucket = ca->mi.first_bucket;
- buckets->nbuckets = ca->mi.nbuckets;
- buckets->nbuckets_minus_first =
- buckets->nbuckets - buckets->first_bucket;
- rcu_assign_pointer(ca->buckets_gc, buckets);
}
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 56ea9a77cd4a..1c1448b52207 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset(b, b->set, &b->data->keys);
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1219,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bkey_val_validate(c, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
- !bversion_cmp(u.k->version, MAX_VERSION))) {
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -1666,7 +1670,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_btree_pos_to_text(&buf, c, b);
bch_err_ratelimited(c, "%s", buf.buf);
- if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
bch2_fatal_error(c);
@@ -1749,10 +1753,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
ret = -BCH_ERR_btree_node_read_error;
@@ -2031,7 +2033,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
do_write:
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 63d76f5c6403..9b01ca3de907 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -18,13 +18,13 @@ struct btree_node_read_all;
static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
- atomic_inc(&c->btree_cache.dirty);
+ atomic_long_inc(&c->btree_cache.nr_dirty);
}
static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b)
{
if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
- atomic_dec(&c->btree_cache.dirty);
+ atomic_long_dec(&c->btree_cache.nr_dirty);
}
static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 2e84d22e17bd..bfe9f0c1e1be 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1010,9 +1010,9 @@ retry_all:
* the same position:
*/
if (trans->paths[idx].uptodate) {
- __btree_path_get(&trans->paths[idx], false);
+ __btree_path_get(trans, &trans->paths[idx], false);
ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
- __btree_path_put(&trans->paths[idx], false);
+ __btree_path_put(trans, &trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(!trans->srcu_held))
bch2_trans_srcu_lock(trans);
+ trace_btree_path_traverse_start(trans, path);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
+ trace_btree_path_traverse_end(trans, path);
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
panic("ret %s (%i) trans->restarted %s (%i)\n",
@@ -1225,7 +1228,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
- __btree_path_get(trans->paths + new, intent);
+ __btree_path_get(trans, trans->paths + new, intent);
#ifdef TRACK_PATH_ALLOCATED
trans->paths[new].ip_allocated = ip;
#endif
@@ -1236,8 +1239,10 @@ __flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(trans->paths + path, intent);
+ struct btree_path *old = trans->paths + path;
+ __btree_path_put(trans, trans->paths + path, intent);
path = btree_path_clone(trans, path, intent, ip);
+ trace_btree_path_clone(trans, old, trans->paths + path);
trans->paths[path].preserve = false;
return path;
}
@@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
+ trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
+
path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
struct btree_path *path = trans->paths + path_idx;
@@ -1361,13 +1368,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
{
struct btree_path *path = trans->paths + path_idx, *dup;
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans, path, intent))
return;
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
+ trace_btree_path_free(trans, path_idx, dup);
+
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
@@ -1392,7 +1401,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- if (!__btree_path_put(trans->paths + path, intent))
+ if (!__btree_path_put(trans, trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- prt_printf(buf, "transaction updates for %s journal seq %llu\n",
- trans->fn, trans->journal_res.seq);
+ prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
+ trans->nr_updates, trans->fn, trans->journal_res.seq);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
@@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
@@ -1716,14 +1725,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
trans->paths[path_pos].level == level) {
- __btree_path_get(trans->paths + path_pos, intent);
+ trace_btree_path_get(trans, trans->paths + path_pos, &pos);
+
+ __btree_path_get(trans, trans->paths + path_pos, intent);
path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
path = trans->paths + path_idx;
} else {
path_idx = btree_path_alloc(trans, path_pos);
path = trans->paths + path_idx;
- __btree_path_get(path, intent);
+ __btree_path_get(trans, path, intent);
path->pos = pos;
path->btree_id = btree_id;
path->cached = cached;
@@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
path->ip_allocated = ip;
#endif
trans->paths_sorted = false;
+
+ trace_btree_path_alloc(trans, path);
}
if (!(flags & BTREE_ITER_nopreserve))
@@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_path *path = btree_iter_path(trans, iter);
if (btree_path_node(path, path->level))
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
return 0;
}
@@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
@@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
@@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@@ -2326,7 +2339,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(trans->paths + iter->update_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_all_snapshots))
@@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
path = btree_iter_path(trans, iter);
+ trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
saved_k = *k.k;
saved_v = k.v;
}
@@ -2527,7 +2541,7 @@ got_key:
continue;
}
- btree_path_set_should_be_locked(path);
+ btree_path_set_should_be_locked(trans, path);
break;
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
@@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2712,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
+/* Obsolete, but still used by rust wrapper in -tools */
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
struct bkey_s_c k;
@@ -2911,9 +2926,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
- __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
- __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
+ __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@@ -3237,7 +3252,7 @@ void bch2_trans_put(struct btree_trans *trans)
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
- __btree_path_put(trans->paths + i->path, true);
+ __btree_path_put(trans, trans->paths + i->path, true);
trans->nr_updates = 0;
check_btree_paths_leaked(trans);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 222b7ce8a901..78e63ad7d380 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -6,6 +6,12 @@
#include "btree_types.h"
#include "trace.h"
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
static inline int __bkey_err(const struct bkey *k)
{
return PTR_ERR_OR_ZERO(k);
@@ -13,16 +19,28 @@ static inline int __bkey_err(const struct bkey *k)
#define bkey_err(_k) __bkey_err((_k).k)
-static inline void __btree_path_get(struct btree_path *path, bool intent)
+static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ unsigned idx = path - trans->paths;
+
+ EBUG_ON(!test_bit(idx, trans->paths_allocated));
+ if (unlikely(path->ref == U8_MAX)) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("path %u refcount overflow\n", idx);
+ }
+
path->ref++;
path->intent_ref += intent;
+ trace_btree_path_get_ll(trans, path);
}
-static inline bool __btree_path_put(struct btree_path *path, bool intent)
+static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
+ EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
EBUG_ON(!path->ref);
EBUG_ON(!path->intent_ref && intent);
+
+ trace_btree_path_put_ll(trans, path);
path->intent_ref -= intent;
return --path->ref == 0;
}
@@ -511,6 +529,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *);
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
size = roundup(size, 8);
@@ -814,20 +838,6 @@ transaction_restart: \
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@@ -868,7 +878,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, _do); \
+ _ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
@@ -881,7 +891,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
- _ret = drop_locks_do(trans, ((_p = _do), 0)); \
+ _ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
@@ -894,12 +904,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret; \
})
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index fda7998734cb..244610b1d0b5 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -79,134 +79,47 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
- BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
- bch2_btree_key_cache_params));
- memset(&ck->key, ~0, sizeof(ck->key));
-
- atomic_long_dec(&c->nr_keys);
-}
-
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- if (ck->c.lock.readers) {
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- } else {
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
+ bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+ bch2_btree_key_cache_params);
+ if (ret) {
+ memset(&ck->key, ~0, sizeof(ck->key));
+ atomic_long_dec(&c->nr_keys);
}
- atomic_long_inc(&bc->nr_freed);
-
- kfree(ck->k);
- ck->k = NULL;
- ck->u64s = 0;
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
+ return ret;
}
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
- struct bkey_cached *pos;
-
- bc->nr_freed_nonpcpu++;
+ struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+ struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
- list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
- if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
- pos->btree_trans_barrier_seq)) {
- list_move(&ck->list, &pos->list);
- return;
- }
- }
-
- list_move(&ck->list, &bc->freed_nonpcpu);
+ this_cpu_dec(*c->btree_key_cache.nr_pending);
+ kmem_cache_free(bch2_key_cache, ck);
}
-#endif
-
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
- struct bkey_cached *ck)
-{
- BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
- if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
- bool freed = false;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- if (f->nr < ARRAY_SIZE(f->objs)) {
- f->objs[f->nr++] = ck;
- freed = true;
- }
- preempt_enable();
- if (!freed) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (f->nr > ARRAY_SIZE(f->objs) / 2) {
- struct bkey_cached *ck2 = f->objs[--f->nr];
-
- __bkey_cached_move_to_freelist_ordered(bc, ck2);
- }
- preempt_enable();
-
- __bkey_cached_move_to_freelist_ordered(bc, ck);
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_nonpcpu);
- bc->nr_freed_nonpcpu++;
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- list_move_tail(&ck->list, &bc->freed_pcpu);
- bc->nr_freed_pcpu++;
- mutex_unlock(&bc->lock);
- }
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static void bkey_cached_free(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
- ck->btree_trans_barrier_seq =
- start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
- list_del_init(&ck->list);
- atomic_long_inc(&bc->nr_freed);
-
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
- bkey_cached_move_to_freelist(bc, ck);
-
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
+
+ bool pcpu_readers = ck->c.lock.readers != NULL;
+ rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+ this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
+ gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
+
struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
if (unlikely(!ck))
return NULL;
@@ -224,74 +137,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
- if (!pcpu_readers) {
-#ifdef __KERNEL__
- struct btree_key_cache_freelist *f;
-
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
- if (f->nr)
- ck = f->objs[--f->nr];
- preempt_enable();
-
- if (!ck) {
- mutex_lock(&bc->lock);
- preempt_disable();
- f = this_cpu_ptr(bc->pcpu_freed);
-
- while (!list_empty(&bc->freed_nonpcpu) &&
- f->nr < ARRAY_SIZE(f->objs) / 2) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- f->objs[f->nr++] = ck;
- }
-
- ck = f->nr ? f->objs[--f->nr] : NULL;
- preempt_enable();
- mutex_unlock(&bc->lock);
- }
-#else
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_nonpcpu)) {
- ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_nonpcpu--;
- }
- mutex_unlock(&bc->lock);
-#endif
- } else {
- mutex_lock(&bc->lock);
- if (!list_empty(&bc->freed_pcpu)) {
- ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
- list_del_init(&ck->list);
- bc->nr_freed_pcpu--;
- }
- mutex_unlock(&bc->lock);
- }
-
- if (ck) {
- ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
- if (unlikely(ret)) {
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
-
- ret = bch2_btree_node_lock_write(trans, path, &ck->c);
- if (unlikely(ret)) {
- btree_node_unlock(trans, path, 0);
- bkey_cached_move_to_freelist(bc, ck);
- return ERR_PTR(ret);
- }
-
- return ck;
- }
+ struct bkey_cached *ck = container_of_or_null(
+ rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
@@ -302,15 +155,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
return ERR_PTR(ret);
}
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ if (ck) {
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ ck->c.cached = true;
+ goto lock;
+ }
- ck->c.cached = true;
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
+ ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
+ struct bkey_cached, rcu);
+ if (ck)
+ goto lock;
+lock:
+ six_lock_intent(&ck->c.lock, NULL, NULL);
+ six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@@ -322,21 +179,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
- mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
- goto out;
+ if (bkey_cached_evict(c, ck))
+ goto out;
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
- mutex_unlock(&c->lock);
return ck;
}
@@ -415,7 +272,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@@ -611,8 +468,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
- bkey_cached_evict(&c->btree_key_cache, ck);
- bkey_cached_free_fast(&c->btree_key_cache, ck);
+ if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+ bkey_cached_free(&c->btree_key_cache, ck);
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ }
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@@ -722,7 +583,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
- bkey_cached_free_fast(bc, ck);
+ bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@@ -735,48 +596,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
- struct bkey_cached *ck, *t;
+ struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
- unsigned start, flags;
+ unsigned iter, start;
int srcu_idx;
- mutex_lock(&bc->lock);
- bc->requested_to_free += sc->nr_to_scan;
-
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- flags = memalloc_nofs_save();
-
- /*
- * Newest freed entries are at the end of the list - once we hit one
- * that's too new to be freed, we can bail out:
- */
- list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_nonpcpu--;
- bc->freed++;
- }
-
- list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
- if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
- ck->btree_trans_barrier_seq))
- break;
-
- list_del(&ck->list);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- atomic_long_dec(&bc->nr_freed);
- bc->nr_freed_pcpu--;
- bc->freed++;
- }
-
rcu_read_lock();
+
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
@@ -792,17 +619,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
}
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- start = bc->shrink_iter;
+ iter = bc->shrink_iter;
+ if (iter >= tbl->size)
+ iter = 0;
+ start = iter;
do {
struct rhash_head *pos, *next;
- pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
+ pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
- next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -812,29 +640,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
- } else {
- bkey_cached_evict(bc, ck);
+ } else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
- bc->moved_to_freelist++;
+ bc->freed++;
freed++;
+ } else {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
- break;
+ goto out;
pos = next;
}
- bc->shrink_iter++;
- if (bc->shrink_iter >= tbl->size)
- bc->shrink_iter = 0;
- } while (scanned < nr && bc->shrink_iter != start);
+ iter++;
+ if (iter >= tbl->size)
+ iter = 0;
+ } while (scanned < nr && iter != start);
+out:
+ bc->shrink_iter = iter;
rcu_read_unlock();
- memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- mutex_unlock(&bc->lock);
return freed;
}
@@ -862,18 +692,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
- struct bkey_cached *ck, *n;
+ struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
-#ifdef __KERNEL__
- int cpu;
-#endif
shrinker_free(bc->shrink);
- mutex_lock(&bc->lock);
-
/*
* The loop is needed to guard against racing with rehash:
*/
@@ -892,44 +717,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &items);
+ BUG_ON(!bkey_cached_evict(bc, ck));
+ kfree(ck->k);
+ kmem_cache_free(bch2_key_cache, ck);
}
}
rcu_read_unlock();
}
-#ifdef __KERNEL__
- if (bc->pcpu_freed) {
- for_each_possible_cpu(cpu) {
- struct btree_key_cache_freelist *f =
- per_cpu_ptr(bc->pcpu_freed, cpu);
-
- for (i = 0; i < f->nr; i++) {
- ck = f->objs[i];
- list_add(&ck->list, &items);
- }
- }
- }
-#endif
-
- BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
- BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
- list_splice(&bc->freed_pcpu, &items);
- list_splice(&bc->freed_nonpcpu, &items);
-
- mutex_unlock(&bc->lock);
-
- list_for_each_entry_safe(ck, n, &items, list) {
- cond_resched();
-
- list_del(&ck->list);
- kfree(ck->k);
- six_lock_exit(&ck->c.lock);
- kmem_cache_free(bch2_key_cache, ck);
- }
-
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
test_bit(BCH_FS_was_rw, &c->flags))
@@ -943,14 +738,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
- free_percpu(bc->pcpu_freed);
+ rcu_pending_exit(&bc->pending[0]);
+ rcu_pending_exit(&bc->pending[1]);
+
+ free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
- mutex_init(&c->lock);
- INIT_LIST_HEAD(&c->freed_pcpu);
- INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -958,11 +753,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
-#ifdef __KERNEL__
- bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
- if (!bc->pcpu_freed)
+ bc->nr_pending = alloc_percpu(size_t);
+ if (!bc->nr_pending)
+ return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+ if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+ rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@@ -984,45 +781,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
- struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
printbuf_tabstop_push(out, 24);
printbuf_tabstop_push(out, 12);
- unsigned flags = memalloc_nofs_save();
- mutex_lock(&bc->lock);
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
- prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
- prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
-
- prt_printf(out, "\nshrinker:\n");
+ prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
+ prt_newline(out);
+ prt_printf(out, "shrinker:\n");
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
- prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
-
- prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
-
- struct bkey_cached *ck;
- unsigned iter = 0;
- list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
- prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
-
- iter = 0;
- list_for_each_entry(ck, &bc->freed_pcpu, list) {
- prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
- if (++iter > 10)
- break;
- }
- mutex_unlock(&bc->lock);
- memalloc_flags_restore(flags);
+ prt_newline(out);
+ prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 237e8bb3ac40..722f1ed10551 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -2,33 +2,25 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
+#include "rcu_pending.h"
struct btree_key_cache {
- struct mutex lock;
struct rhashtable table;
bool table_init_done;
- struct list_head freed_pcpu;
- size_t nr_freed_pcpu;
- struct list_head freed_nonpcpu;
- size_t nr_freed_nonpcpu;
-
struct shrinker *shrink;
unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
- atomic_long_t nr_freed;
+ /* 0: non pcpu reader locks, 1: pcpu reader locks */
+ struct rcu_pending pending[2];
+ size_t __percpu *nr_pending;
+
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
- unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 11a64ead8685..7c07f9fa9add 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -218,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
bool lock_may_not_fail,
unsigned long ip)
{
- int ret;
-
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
- ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
+ int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
+
+ if (!ret)
+ trace_btree_path_lock(trans, _THIS_IP_, b);
return ret;
}
@@ -281,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ bch2_trans_verify_not_unlocked(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -400,12 +402,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
/* misc: */
-static inline void btree_path_set_should_be_locked(struct btree_path *path)
+static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index b28c649c6838..1e694fedc5da 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
w->ca = ca;
t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = IS_ERR_OR_NULL(t);
+ ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
closure_put(&cl);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index a0101d9c5d83..1a74a1a252ee 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
- bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+ bch2_bset_insert(b, k, insert, clobber_u64s);
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)
@@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
+ i->k->k.bversion.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys)
trans_for_each_update(trans, i)
- i->k->k.version = MAX_VERSION;
+ i->k->k.bversion = MAX_VERSION;
}
h = trans->hooks;
@@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct jset_entry *entry = trans->journal_entries;
- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- percpu_down_read(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry->start->k.type == KEY_TYPE_accounting) {
+ BUG_ON(!trans->journal_res.ref);
+
+ struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) entry - (u64 *) trans->journal_entries);
+ BUG_ON(bversion_zero(a->k.bversion));
- a->k.version = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) entry - (u64 *) trans->journal_entries);
- BUG_ON(bversion_zero(a->k.version));
- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false);
+ if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
if (ret)
goto revert_fs_usage;
}
- percpu_up_read(&c->mark_lock);
+ }
+ percpu_up_read(&c->mark_lock);
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
trans_for_each_update(trans, i)
if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
@@ -735,6 +739,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
+ trans_for_each_update(trans, i) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags);
+ if (unlikely(ret)){
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
+ goto fatal_err;
+ }
+ }
+
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -798,7 +836,7 @@ revert_fs_usage:
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, false, false);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
bch2_accounting_neg(a);
}
percpu_up_read(&c->mark_lock);
@@ -1019,40 +1057,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (ret)
goto out_reset;
- trans_for_each_update(trans, i) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags);
- if (unlikely(ret)){
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- return ret;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i)) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- return ret;
- }
- }
-
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index b256b2a20a4f..4568a41fefaf 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -138,6 +138,31 @@ struct btree {
struct list_head list;
};
+#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+ x(read_in_flight) \
+ x(write_in_flight) \
+ x(noevict) \
+ x(write_blocked) \
+ x(will_make_reachable) \
+ x(access_bit)
+
+enum bch_btree_cache_not_freed_reasons {
+#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
+ BCH_BTREE_CACHE_NOT_FREED_REASONS()
+#undef x
+ BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
+};
+
+struct btree_cache_list {
+ unsigned idx;
+ struct shrinker *shrink;
+ struct list_head list;
+ size_t nr;
+};
+
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@@ -155,28 +180,19 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
- struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
+ struct btree_cache_list live[2];
- /* Number of elements in live + freeable lists */
- unsigned used;
- unsigned reserve;
- unsigned freed;
- unsigned not_freed_lock_intent;
- unsigned not_freed_lock_write;
- unsigned not_freed_dirty;
- unsigned not_freed_read_in_flight;
- unsigned not_freed_write_in_flight;
- unsigned not_freed_noevict;
- unsigned not_freed_write_blocked;
- unsigned not_freed_will_make_reachable;
- unsigned not_freed_access_bit;
- atomic_t dirty;
- struct shrinker *shrink;
+ size_t nr_freeable;
+ size_t nr_reserve;
+ size_t nr_by_btree[BTREE_ID_NR];
+ atomic_long_t nr_dirty;
- unsigned used_by_btree[BTREE_ID_NR];
+ /* shrinker stats */
+ size_t nr_freed;
+ u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
/*
* If we need to allocate memory for a new btree node and that
@@ -189,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
- u64 pinned_nodes_leaf_mask;
- u64 pinned_nodes_interior_mask;
+ /* btree id mask: 0 for leaves, 1 for interior */
+ u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@@ -386,17 +402,16 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
- unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
- struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
+ struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
@@ -583,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
- x(never_write)
+ x(never_write) \
+ x(pinned)
enum btree_flags {
/* First bits for btree node write type */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index d6f6df10dcc3..514df618548e 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->key_cache_already_flushed = true;
i->flags |= BTREE_TRIGGER_norun;
- btree_path_set_should_be_locked(btree_path);
+ btree_path_set_should_be_locked(trans, btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
bch2_path_put(trans, path_idx, true);
@@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
break;
}
- if (!cmp && i < trans->updates + trans->nr_updates) {
+ bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
+
+ if (overwrite) {
EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
@@ -449,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
}
}
- __btree_path_get(trans->paths + i->path, true);
+ __btree_path_get(trans, trans->paths + i->path, true);
+
+ trace_update_by_path(trans, path, i, overwrite);
/*
* If a key is present in the key cache, it must also exist in the
@@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
}
return 0;
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 60393e98084d..6a454f2fa005 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
if (type && k.k->type != type)
return ERR_PTR(-ENOENT);
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ /* extra padding for varint_decode_fast... */
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
if (!IS_ERR(mut)) {
bkey_reassemble(mut, k);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 8fd112026e7a..190bc1e81756 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "clock.h"
#include "error.h"
#include "extents.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
@@ -145,7 +146,7 @@ fsck_err:
printbuf_exit(&buf);
return ret;
topology_repair:
- if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+ if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
bch2_inconsistent_error(c);
ret = -BCH_ERR_btree_need_topology_repair;
@@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+
+ mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
+ mutex_unlock(&c->btree_cache.lock);
+
__btree_node_free(trans, b);
+
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
- list_del_init(&b->list);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
@@ -732,6 +737,18 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s", bch2_err_str(ret));
err:
/*
+ * Ensure transaction is unlocked before using btree_node_lock_nopath()
+ * (the use of which is always suspect, we need to work on removing this
+ * in the future)
+ *
+ * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
+ * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
+ * rarely end up with a locked path besides the one we have here:
+ */
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
+
+ /*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
* recheck under btree_update_lock below:
@@ -750,18 +767,6 @@ err:
* we're in journal error state:
*/
- /*
- * Ensure transaction is unlocked before using
- * btree_node_lock_nopath() (the use of which is always suspect,
- * we need to work on removing this in the future)
- *
- * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
- * calls bch2_path_upgrade(), before we call path_make_mut(), so
- * we may rarely end up with a locked path besides the one we
- * have here:
- */
- bch2_trans_unlock(trans);
- bch2_trans_begin(trans);
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
- list_add_tail(&b->list, &c->btree_cache.live);
+ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
@@ -1981,7 +1986,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
- btree_path_set_should_be_locked(trans->paths + sib_path);
+ btree_path_set_should_be_locked(trans, trans->paths + sib_path);
m = trans->paths[sib_path].l[level].b;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 02c6ecada97c..10f400957f21 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
+ bch2_trans_verify_not_unlocked(trans);
+
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 721bbe1dffc1..546cd01a72e3 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
struct bch_dev_usage *usage)
{
+ if (out->nr_tabstops < 5) {
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 12);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ printbuf_tabstop_push(out, 16);
+ }
+
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
for (unsigned i = 0; i < BCH_DATA_NR; i++) {
@@ -272,7 +281,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
goto err;
rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
rcu_read_unlock();
if (level) {
@@ -477,7 +486,7 @@ out:
return ret;
err:
bch2_dump_trans_updates(trans);
- ret = -EIO;
+ ret = -BCH_ERR_bucket_ref_update;
goto out;
}
@@ -556,22 +565,24 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
s64 *sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
bool insert = !(flags & BTREE_TRIGGER_overwrite);
struct printbuf buf = PRINTBUF;
int ret = 0;
- struct bch_fs *c = trans->c;
+ u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
+ *sectors = insert ? abs_sectors : -abs_sectors;
+
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -EIO;
+ ret = -BCH_ERR_trigger_pointer;
goto err;
}
struct bpos bucket;
struct bch_backpointer bp;
- bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
- *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
+ __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors);
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
@@ -593,7 +604,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_trigger_pointer;
goto err_unlock;
}
@@ -638,7 +649,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -EIO;
+ ret = -BCH_ERR_trigger_stripe_pointer;
goto err;
}
@@ -677,7 +688,7 @@ err:
(u64) p.ec.idx, buf.buf);
printbuf_exit(&buf);
bch2_inconsistent_error(c);
- return -EIO;
+ return -BCH_ERR_trigger_stripe_pointer;
}
m->block_sectors[p.ec.block] += sectors;
@@ -741,7 +752,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
} else if (!p.has_ec) {
*replicas_sectors += disk_sectors;
- acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@@ -957,7 +968,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_type_str(a->v.data_type),
bch2_data_type_str(type),
bch2_data_type_str(type));
- ret = -EIO;
+ ret = -BCH_ERR_metadata_bucket_inconsistency;
goto err;
}
@@ -1013,7 +1024,7 @@ err:
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
- return -EIO;
+ return -BCH_ERR_metadata_bucket_inconsistency;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index edbdffd508fc..e2cb7b24b220 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
-static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
-{
- return rcu_dereference_check(ca->buckets_gc,
- !ca->fs ||
- percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->state_lock) ||
- lockdep_is_held(&ca->bucket_lock));
-}
-
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
- struct bucket_array *buckets = gc_bucket_array(ca);
-
- if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
- return NULL;
- return buckets->b + b;
+ return genradix_ptr(&ca->buckets_gc, b);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index c9698cdf866f..28bd09a253c8 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -19,14 +19,6 @@ struct bucket {
u32 stripe_sectors;
} __aligned(sizeof(long));
-struct bucket_array {
- struct rcu_head rcu;
- u16 first_bucket;
- size_t nbuckets;
- size_t nbuckets_minus_first;
- struct bucket b[];
-};
-
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index ef1f74866e23..cbfd88f98472 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -471,7 +471,6 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
static const struct file_operations bcachefs_data_ops = {
.release = bch2_data_job_release,
.read = bch2_data_job_read,
- .llseek = no_llseek,
};
static long bch2_ioctl_data(struct bch_fs *c,
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index e7208bf1974e..ce8fc677bef9 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
- int ret;
skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
- ret = crypto_skcipher_encrypt(req);
+ int ret = crypto_skcipher_encrypt(req);
if (ret)
pr_err("got error %i from crypto_skcipher_encrypt()", ret);
@@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
void *buf, size_t len)
{
if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg,
- is_vmalloc_addr(buf)
- ? vmalloc_to_page(buf)
- : virt_to_page(buf),
- len, offset_in_page(buf));
+ struct scatterlist sg = {};
+
+ sg_mark_end(&sg);
+ sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
return do_encrypt_sg(tfm, nonce, &sg, len);
} else {
- unsigned pages = buf_pages(buf, len);
- struct scatterlist *sg;
- size_t orig_len = len;
- int ret, i;
-
- sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
- if (!sg)
- return -BCH_ERR_ENOMEM_do_encrypt;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
+ int ret;
- sg_init_table(sg, pages);
+ darray_init(&sgl);
- for (i = 0; i < pages; i++) {
+ while (len) {
unsigned offset = offset_in_page(buf);
- unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) vmalloc_to_page(buf),
+ .offset = offset,
+ .length = min(len, PAGE_SIZE - offset),
+ };
- sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
- buf += pg_len;
- len -= pg_len;
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+ if (ret)
+ goto err;
+
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
+ BUG_ON(darray_push(&sgl, sg));
+ }
+
+ buf += sg.length;
+ len -= sg.length;
+ sgl_len += sg.length;
}
- ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
- kfree(sg);
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
}
@@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- struct scatterlist sgl[16], *sg = sgl;
- size_t bytes = 0;
+ DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
+ size_t sgl_len = 0;
int ret = 0;
if (!bch2_csum_type_is_encryption(type))
return 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
+ darray_init(&sgl);
bio_for_each_segment(bv, bio, iter) {
- if (sg == sgl + ARRAY_SIZE(sgl)) {
- sg_mark_end(sg - 1);
-
- ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ struct scatterlist sg = {
+ .page_link = (unsigned long) bv.bv_page,
+ .offset = bv.bv_offset,
+ .length = bv.bv_len,
+ };
+
+ if (darray_push(&sgl, sg)) {
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
if (ret)
- return ret;
+ goto err;
- nonce = nonce_add(nonce, bytes);
- bytes = 0;
+ nonce = nonce_add(nonce, sgl_len);
+ sgl_len = 0;
+ sgl.nr = 0;
- sg_init_table(sgl, ARRAY_SIZE(sgl));
- sg = sgl;
+ BUG_ON(darray_push(&sgl, sg));
}
- sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
- bytes += bv.bv_len;
- }
-
- if (sg != sgl) {
- sg_mark_end(sg - 1);
- return do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+ sgl_len += sg.length;
}
+ sg_mark_end(&darray_last(sgl));
+ ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
+err:
+ darray_exit(&sgl);
return ret;
}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 85c975dfbcfe..82c79c8baf92 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
-#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
-({ \
- long __ret = timeout; \
- might_sleep(); \
- if (!___wait_cond_timeout(condition)) \
- __ret = __wait_event_timeout(wq, condition, timeout); \
- __ret; \
-})
-
void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index b7d223f85873..4f06cd8bbbe1 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"
-int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
+int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array(new_size, element_size, gfp);
+ void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index 4b340d13caac..8f4c3f0665c4 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
-int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
-
-static inline int __darray_resize(darray_char *d, size_t element_size,
- size_t new_size, gfp_t gfp)
-{
- return unlikely(new_size > d->size)
- ? __bch2_darray_resize(d, element_size, new_size, gfp)
- : 0;
-}
+int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
+
+#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
+
+#define __darray_resize(_d, _element_size, _new_size, _gfp) \
+ (unlikely((_new_size) > (_d)->size) \
+ ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
+ : 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
- unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
+ __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
-static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
-{
- return __darray_resize(d, t_size, d->nr + more, gfp);
-}
-
#define darray_make_room_gfp(_d, _more, _gfp) \
- __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
+ darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 004894ad4147..462b1a2fe1ad 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -571,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
while (data_opts.kill_ptrs) {
unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
data_opts.kill_ptrs ^= 1U << drop;
}
@@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->version;
+ m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 32bfdf19289a..84dd4a879d98 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -552,62 +552,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- subvol_inum target;
- u32 snapshot;
struct bkey_buf sk;
- int ret;
-
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inum.inum, ctx->pos, snapshot),
- POS(inum.inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
+ POS(inum.inum, ctx->pos),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
- /* dir_emit() can fault and block: */
- bch2_bkey_buf_reassemble(&sk, c, k);
- struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
+ /* dir_emit() can fault and block: */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
- ret = bch2_dirent_read_target(trans, inum, dirent, &target);
- if (ret < 0)
- break;
- if (ret)
- continue;
+ subvol_inum target;
+ int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+ if (ret2 > 0)
+ continue;
- /*
- * read_target looks up subvolumes, we can overflow paths if the
- * directory has many subvolumes in it
- *
- * XXX: btree_trans_too_many_iters() is something we'd like to
- * get rid of, and there's no good reason to be using it here
- * except that we don't yet have a for_each_btree_key() helper
- * that does subvolume_get_snapshot().
- */
- ret = drop_locks_do(trans,
- bch2_dir_emit(ctx, dirent, target)) ?:
- btree_trans_too_many_iters(trans);
- if (ret) {
- ret = ret < 0 ? ret : 0;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
+ ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ })));
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
- return ret;
+ return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index e972e2bca546..9f3133e3e7e5 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
void *end = &acc_k + 1;
int ret = 0;
+ bkey_fsck_err_on(bversion_zero(k.k->bversion),
+ c, accounting_key_version_0,
+ "accounting key with version=0");
+
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_nr_inodes:
end = field_end(acc_k, nr_inodes);
@@ -291,7 +295,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
struct accounting_mem_entry n = {
.pos = a.k->p,
- .version = a.k->version,
+ .bversion = a.k->bversion,
.nr_counters = bch2_accounting_counters(a.k),
.v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
sizeof(u64), GFP_KERNEL),
@@ -319,11 +323,13 @@ err:
return -BCH_ERR_ENOMEM_disk_accounting;
}
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_replicas_padded r;
- if (accounting_to_replicas(&r.e, a.k->p) &&
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
return -BCH_ERR_btree_insert_need_mark_replicas;
@@ -566,7 +572,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false);
+ bch2_accounting_mem_mod_locked(trans,
+ bkey_i_to_s_c_accounting(&k_i.k),
+ BCH_ACCOUNTING_normal);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -589,30 +597,14 @@ fsck_err:
static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
if (k.k->type != KEY_TYPE_accounting)
return 0;
percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
+ BCH_ACCOUNTING_read);
percpu_up_read(&c->mark_lock);
-
- if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
- ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = 0;
-
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (bch2_accounting_key_to_text(&buf, &acc),
- buf.buf)))
- ret = bch2_accounting_update_sb_one(c, k.k->p);
-fsck_err:
- printbuf_exit(&buf);
return ret;
}
@@ -624,6 +616,7 @@ int bch2_accounting_read(struct bch_fs *c)
{
struct bch_accounting_mem *acc = &c->accounting;
struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
int ret = for_each_btree_key(trans, iter,
BTREE_ID_accounting, POS_MIN,
@@ -647,7 +640,7 @@ int bch2_accounting_read(struct bch_fs *c)
accounting_pos_cmp, &k.k->p);
bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;
+ bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
if (applied)
continue;
@@ -655,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (i + 1 < &darray_top(*keys) &&
i[1].k->k.type == KEY_TYPE_accounting &&
!journal_key_cmp(i, i + 1)) {
- BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0);
+ WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
i[1].journal_seq = i[0].journal_seq;
@@ -674,6 +667,45 @@ int bch2_accounting_read(struct bch_fs *c)
keys->gap = keys->nr = dst - keys->data;
percpu_down_read(&c->mark_lock);
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
+ continue;
+
+ struct bch_replicas_padded r;
+ if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
+ continue;
+
+ /*
+ * If the replicas entry is invalid it'll get cleaned up by
+ * check_allocations:
+ */
+ if (bch2_replicas_entry_validate(&r.e, c, &buf))
+ continue;
+
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &k),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_read(&c->mark_lock);
+ }
+ }
+
preempt_disable();
struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
@@ -709,8 +741,10 @@ int bch2_accounting_read(struct bch_fs *c)
}
}
preempt_enable();
+fsck_err:
percpu_up_read(&c->mark_lock);
err:
+ printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f29fd0dd9581..4ea6c8a092bc 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
dst->v.d[i] += src.v->d[i];
- if (bversion_cmp(dst->k.version, src.k->version) < 0)
- dst->k.version = src.k->version;
+ if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
+ dst->k.bversion = src.k->bversion;
}
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
@@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r)
return bpos_cmp(*l, *r);
}
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool);
+enum bch_accounting_mode {
+ BCH_ACCOUNTING_normal,
+ BCH_ACCOUNTING_gc,
+ BCH_ACCOUNTING_read,
+};
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
*/
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read)
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
+ struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_fs *c = trans->c;
+ struct bch_accounting_mem *acc = &c->accounting;
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ bool gc = mode == BCH_ACCOUNTING_gc;
+
+ EBUG_ON(gc && !acc->gc_running);
if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
return 0;
- if (!gc && !read) {
+ if (mode == BCH_ACCOUNTING_normal) {
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
@@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
}
}
- struct bch_accounting_mem *acc = &c->accounting;
unsigned idx;
- EBUG_ON(gc && !acc->gc_running);
-
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, gc);
+ int ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
index 1687a45177a7..b1982131b206 100644
--- a/fs/bcachefs/disk_accounting_types.h
+++ b/fs/bcachefs/disk_accounting_types.h
@@ -6,7 +6,7 @@
struct accounting_mem_entry {
struct bpos pos;
- struct bversion version;
+ struct bversion bversion;
unsigned nr_counters;
u64 __percpu *v[2];
};
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 141a4c63142f..1587c6e1866a 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -18,6 +18,7 @@
#include "ec.h"
#include "error.h"
#include "io_read.h"
+#include "io_write.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
@@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
bch2_prt_csum_type(out, s.csum_type);
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+ if (s.disk_label) {
+ prt_str(out, " label");
+ bch2_disk_path_to_text(out, c, s.disk_label - 1);
+ }
+
for (unsigned i = 0; i < s.nr_blocks; i++) {
const struct bch_extent_ptr *ptr = sp->ptrs + i;
if ((void *) ptr >= bkey_val_end(k))
break;
+ prt_char(out, ' ');
bch2_extent_ptr_to_text(out, c, ptr);
if (s.csum_type < BCH_CSUM_NR &&
@@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
} else {
@@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
}
@@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
- if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -EIO;
+ if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
+ ret = -BCH_ERR_mark_stripe;
goto err;
}
@@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -EIO;
+ ret = -BCH_ERR_mark_stripe;
goto err_unlock;
}
@@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
+static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
+{
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->disk_label = s->disk_label;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+}
+
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
memset(m, 0, sizeof(*m));
} else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+ stripe_to_mem(m, new_s);
if (!old_s)
bch2_stripes_heap_insert(c, m, idx);
@@ -816,13 +829,16 @@ err:
}
/* recovery read path: */
-int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_buf *buf;
+ struct ec_stripe_buf *buf = NULL;
struct closure cl;
struct bch_stripe *v;
unsigned i, offset;
+ const char *msg = NULL;
+ struct printbuf msgbuf = PRINTBUF;
int ret = 0;
closure_init_stack(&cl);
@@ -835,32 +851,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: error %i looking up stripe", ret);
- kfree(buf);
- return -EIO;
+ msg = "stripe not found";
+ goto err;
}
v = &bkey_i_to_stripe(&buf->key)->v;
if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: pointer doesn't match stripe");
- ret = -EIO;
+ msg = "pointer doesn't match stripe";
goto err;
}
offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: read is bigger than stripe");
- ret = -EIO;
+ msg = "read is bigger than stripe";
goto err;
}
ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
- if (ret)
+ if (ret) {
+ msg = "-ENOMEM";
goto err;
+ }
for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
@@ -868,9 +880,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- bch_err_ratelimited(c,
- "error doing reconstruct read: unable to read enough blocks");
- ret = -EIO;
+ msg = "unable to read enough blocks";
goto err;
}
@@ -882,10 +892,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
-err:
+out:
ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
+err:
+ bch2_bkey_val_to_text(&msgbuf, c, orig_k);
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
+ printbuf_exit(&msgbuf);;
+ ret = -BCH_ERR_stripe_reconstruct;
+ goto out;
}
/* stripe bucket accounting: */
@@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
- bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
+ bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
@@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
}
-static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s = h->s;
+ lockdep_assert_held(&h->lock);
+
BUG_ON(!s->allocated && !s->err);
h->s = NULL;
@@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
ec_stripe_new_put(c, s, STRIPE_REF_io);
}
+static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
+{
+ h->s->err = err;
+ ec_stripe_new_set_pending(c, h);
+}
+
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
{
struct ec_stripe_new *s = ob->ec;
@@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
struct bkey_i *k,
unsigned nr_data,
unsigned nr_parity,
- unsigned stripe_size)
+ unsigned stripe_size,
+ unsigned disk_label)
{
struct bkey_i_stripe *s = bkey_stripe_init(k);
unsigned u64s;
@@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
s->v.nr_redundant = nr_parity;
s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
s->v.csum_type = BCH_CSUM_crc32c;
- s->v.pad = 0;
+ s->v.disk_label = disk_label;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
BUG_ON(1 << s->v.csum_granularity_bits >=
@@ -1685,40 +1711,32 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
s->nr_parity = h->redundancy;
ec_stripe_key_init(c, &s->new_stripe.key,
- s->nr_data, s->nr_parity, h->blocksize);
+ s->nr_data, s->nr_parity,
+ h->blocksize, h->disk_label);
h->s = s;
+ h->nr_created++;
return 0;
}
-static struct ec_stripe_head *
-ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy,
- enum bch_watermark watermark)
+static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
- struct ec_stripe_head *h;
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return NULL;
-
- mutex_init(&h->lock);
- BUG_ON(!mutex_trylock(&h->lock));
-
- h->target = target;
- h->algo = algo;
- h->redundancy = redundancy;
- h->watermark = watermark;
+ struct bch_devs_mask devs = h->devs;
rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, target);
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ unsigned nr_devs = dev_mask_nr(&h->devs);
for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
__clear_bit(ca->dev_idx, h->devs.d);
+ unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
h->blocksize = pick_blocksize(c, &h->devs);
+ h->nr_active_devs = 0;
for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1729,9 +1747,50 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
- if (h->nr_active_devs < h->redundancy + 2)
- bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
- h->nr_active_devs, h->redundancy + 2);
+ h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
+
+ if (h->insufficient_devs) {
+ const char *err;
+
+ if (nr_devs < h->redundancy + 2)
+ err = NULL;
+ else if (nr_devs_with_durability < h->redundancy + 2)
+ err = "cannot use durability=0 devices";
+ else
+ err = "mismatched bucket sizes";
+
+ if (err)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
+ h->nr_active_devs, h->redundancy + 2, err);
+ }
+
+ struct bch_devs_mask devs_leaving;
+ bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
+
+ if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
+ ec_stripe_new_cancel(c, h, -EINTR);
+
+ h->rw_devs_change_count = c->rw_devs_change_count;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
+ unsigned algo, unsigned redundancy,
+ enum bch_watermark watermark)
+{
+ struct ec_stripe_head *h;
+
+ h = kzalloc(sizeof(*h), GFP_KERNEL);
+ if (!h)
+ return NULL;
+
+ mutex_init(&h->lock);
+ BUG_ON(!mutex_trylock(&h->lock));
+
+ h->disk_label = disk_label;
+ h->algo = algo;
+ h->redundancy = redundancy;
+ h->watermark = watermark;
list_add(&h->list, &c->ec_stripe_head_list);
return h;
@@ -1743,14 +1802,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data) == h->s->nr_data)
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_set_pending(c, h);
mutex_unlock(&h->lock);
}
static struct ec_stripe_head *
__bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ unsigned disk_label,
unsigned algo,
unsigned redundancy,
enum bch_watermark watermark)
@@ -1768,27 +1827,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
- goto found;
+ goto err;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
- if (h->target == target &&
+ if (h->disk_label == disk_label &&
h->algo == algo &&
h->redundancy == redundancy &&
h->watermark == watermark) {
ret = bch2_trans_mutex_lock(trans, &h->lock);
- if (ret)
+ if (ret) {
h = ERR_PTR(ret);
+ goto err;
+ }
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
+ h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
found:
- if (!IS_ERR_OR_NULL(h) &&
- h->nr_active_devs < h->redundancy + 2) {
+ if (h->rw_devs_change_count != c->rw_devs_change_count)
+ ec_stripe_head_devs_update(c, h);
+
+ if (h->insufficient_devs) {
mutex_unlock(&h->lock);
h = NULL;
}
+err:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1878,7 +1942,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
return 0;
}
-/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
struct ec_stripe_head *head)
{
@@ -1901,7 +1964,8 @@ static s64 get_existing_stripe(struct bch_fs *c,
m = genradix_ptr(&c->stripes, stripe_idx);
- if (m->algorithm == head->algo &&
+ if (m->disk_label == head->disk_label &&
+ m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
@@ -2046,9 +2110,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct ec_stripe_head *h;
bool waiting = false;
+ unsigned disk_label = 0;
+ struct target t = target_decode(target);
int ret;
- h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
+ if (t.type == TARGET_GROUP) {
+ if (t.group > U8_MAX) {
+ bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
+ return NULL;
+ }
+ disk_label = t.group + 1; /* 0 == no label */
+ }
+
+ h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
if (IS_ERR_OR_NULL(h))
return h;
@@ -2126,6 +2200,73 @@ err:
return ERR_PTR(ret);
}
+/* device removal */
+
+static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_i_stripe *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ return ret;
+
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ s64 sectors = 0;
+ for (unsigned i = 0; i < s->v.nr_blocks; i++)
+ sectors -= stripe_blockcount_get(&s->v, i);
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == k_a.k->p.inode)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ sectors = -sectors;
+
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = BCH_DATA_user;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+{
+ return bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter,
+ BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
+ BTREE_ITER_intent, k,
+ NULL, NULL, 0, ({
+ bch2_invalidate_stripe_to_dev(trans, k);
+ })));
+}
+
+/* startup/shutdown */
+
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
@@ -2151,8 +2292,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
}
goto unlock;
found:
- h->s->err = -BCH_ERR_erofs_no_writes;
- ec_stripe_set_pending(c, h);
+ ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
unlock:
mutex_unlock(&h->lock);
}
@@ -2197,17 +2337,9 @@ int bch2_stripes_read(struct bch_fs *c)
if (ret)
break;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-
struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
bch2_stripes_heap_insert(c, m, k.k->p.offset);
0;
@@ -2252,6 +2384,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
prt_printf(out, " %u", s->blocks[i]);
prt_newline(out);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
+ prt_newline(out);
}
void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2261,9 +2395,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "target %u algo %u redundancy %u %s:\n",
- h->target, h->algo, h->redundancy,
- bch2_watermarks[h->watermark]);
+ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
+ h->disk_label, h->algo, h->redundancy,
+ bch2_watermarks[h->watermark],
+ h->nr_created);
if (h->s)
bch2_new_stripe_to_text(out, c, h->s);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 9baf3411a8f9..43326370b410 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -188,10 +188,15 @@ struct ec_stripe_head {
struct list_head list;
struct mutex lock;
- unsigned target;
+ unsigned disk_label;
unsigned algo;
unsigned redundancy;
enum bch_watermark watermark;
+ bool insufficient_devs;
+
+ unsigned long rw_devs_change_count;
+
+ u64 nr_created;
struct bch_devs_mask devs;
unsigned nr_active_devs;
@@ -204,7 +209,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
@@ -249,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
index 44ce88ba08d7..64ef52e00078 100644
--- a/fs/bcachefs/ec_format.h
+++ b/fs/bcachefs/ec_format.h
@@ -11,7 +11,14 @@ struct bch_stripe {
__u8 csum_granularity_bits;
__u8 csum_type;
- __u8 pad;
+
+ /*
+ * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2
+ *
+ * we can manage with this because this only needs to point to a
+ * disk label, not a target:
+ */
+ __u8 disk_label;
struct bch_extent_ptr ptrs[];
} __packed __aligned(8);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 1df03dccfc72..8d1e70e830ac 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -16,6 +16,7 @@ struct stripe {
u8 nr_blocks;
u8 nr_redundant;
u8 blocks_nonempty;
+ u8 disk_label;
};
struct gc_stripe {
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 742dcdd3e5d7..60b7875adada 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -119,8 +119,8 @@
x(EEXIST, EEXIST_str_hash_set) \
x(EEXIST, EEXIST_discard_in_flight_add) \
x(EEXIST, EEXIST_subvolume_create) \
- x(0, open_buckets_empty) \
- x(0, freelist_empty) \
+ x(ENOSPC, open_buckets_empty) \
+ x(ENOSPC, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
x(0, transaction_restart) \
x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \
@@ -244,6 +244,16 @@
x(EIO, btree_node_read_error) \
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
+ x(EIO, bucket_ref_update) \
+ x(EIO, trigger_pointer) \
+ x(EIO, trigger_stripe_pointer) \
+ x(EIO, metadata_bucket_inconsistency) \
+ x(EIO, mark_stripe) \
+ x(EIO, stripe_reconstruct) \
+ x(EIO, key_type_error) \
+ x(EIO, no_device_to_read_from) \
+ x(EIO, missing_indirect_extent) \
+ x(EIO, invalidate_stripe_to_dev) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 95afa7bf2020..3a16b535b6c3 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c,
if (!c)
c = trans->c;
- WARN_ON(!trans && bch2_current_has_btree_trans(c));
+ /*
+ * Ugly: if there's a transaction in the current task it has to be
+ * passed in to unlock if we prompt for user input.
+ *
+ * But, plumbing a transaction and transaction restarts into
+ * bkey_validate() is problematic.
+ *
+ * So:
+ * - make all bkey errors AUTOFIX, they're simple anyways (we just
+ * delete the key)
+ * - and we don't need to warn if we're not prompting
+ */
+ WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 2f1b86978f36..21ee7211b03e 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -184,7 +184,7 @@ do { \
ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \
} \
- int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \
+ int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 324303bf4353..cc0d22085aef 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
if (k.k->type == KEY_TYPE_error)
- return -EIO;
+ return -BCH_ERR_key_type_error;
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
* read:
*/
if (!ret && !p.ptr.cached)
- ret = -EIO;
+ ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
@@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
? f->idx
: f->idx + 1;
- if (!p.idx && !ca)
+ if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
p.idx++;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
- if (!p.idx && !bch2_dev_is_readable(ca))
- p.idx++;
-
- if (p.idx >= (unsigned) p.has_ec + 1)
+ if (p.idx > (unsigned) p.has_ec)
continue;
if (ret > 0 && !ptr_better(c, p, *pick))
@@ -821,6 +818,18 @@ void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
+ if (k.k->type != KEY_TYPE_stripe) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == ptr->dev && p.has_ec) {
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+ return;
+ }
+ }
+
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
bch2_bkey_drop_ptr_noerror(k, ptr);
@@ -848,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
-
- if (ptr)
- bch2_bkey_drop_ptr_noerror(k, ptr);
+ bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
@@ -1021,7 +1027,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
{
out->atomic++;
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
@@ -1125,8 +1131,9 @@ static int extent_ptr_validate(struct bch_fs *c,
{
int ret = 0;
+ /* bad pointers are repaired by check_fix_ptrs(): */
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
rcu_read_unlock();
return 0;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 42a7c6d820a0..ed5001dd662e 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -357,7 +357,7 @@ out: \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
-#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
__bkey_extent_entry_for_each_from(_iter, _end, _iter) \
if (extent_entry_is_crc(_iter)) { \
@@ -372,7 +372,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
(_iter) = (_start); \
- bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ bkey_crc_next(_k, _end, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
#define bkey_for_each_crc(_k, _p, _crc, _iter) \
@@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d
unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
@@ -652,6 +649,23 @@ void bch2_extent_ptr_decoded_append(struct bkey_i *,
void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *);
void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+
+#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \
+do { \
+ __label__ _again; \
+ struct bkey_ptrs _ptrs; \
+_again: \
+ _ptrs = bch2_bkey_ptrs(_k); \
+ \
+ bkey_for_each_ptr(_ptrs, _ptr) \
+ if (_cond) { \
+ bch2_bkey_drop_ptr_noerror(_k, _ptr); \
+ goto _again; \
+ } \
+} while (0)
+
#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
do { \
__label__ _again; \
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 508d029ac53d..7e10a9ddcfd9 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
if (ret)
goto err;
@@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- STR_HASH_must_create);
+ STR_HASH_must_create|BTREE_ITER_with_updates);
if (ret)
goto err;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index ff60c041abe5..48a1ab9a649b 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
- u32 snapshot;
int ret = 0;
rbio->c = c;
@@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans,
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
-retry:
bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -189,7 +182,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -200,7 +193,7 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -210,7 +203,7 @@ retry:
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
- break;
+ goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@ -229,17 +222,13 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
-err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
if (ret) {
bch_err_inum_offset_ratelimited(c,
iter.pos.inode,
@@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
+ op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index e246b1e05aa2..ee1c0325f313 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_subvol;
+ dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index a9cc5cad9cc9..af3a24546aa3 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
struct folio **fs, unsigned nr_folios)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_folio *s;
u64 offset = folio_sector(fs[0]);
- unsigned folio_idx;
- u32 snapshot;
bool need_set = false;
- int ret;
- for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+ for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+ struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
if (!s)
return -ENOMEM;
@@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
if (!need_set)
return 0;
- folio_idx = 0;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_slots, k, ret) {
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = fs[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
- folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) -
- folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_put(trans);
+ unsigned folio_idx = 0;
+
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inum.inum, offset),
+ POS(inum.inum, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k);
+
+ while (folio_idx < nr_folios) {
+ struct folio *folio = fs[folio_idx];
+ u64 folio_start = folio_sector(folio);
+ u64 folio_end = folio_end_sector(folio);
+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+ folio_start;
+ unsigned folio_len = min(k.k->p.offset, folio_end) -
+ folio_offset - folio_start;
+
+ BUG_ON(k.k->p.offset < folio_start);
+ BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+ if (!bch2_folio(folio)->uptodate)
+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+ if (k.k->p.offset < folio_end)
+ break;
+ folio_idx++;
+ }
- return ret;
+ if (folio_idx == nr_folios)
+ break;
+ 0;
+ })));
}
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index fd7d692c087e..fad911cf5068 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
- return folio_has_private(folio)
- ? (struct bch_folio *) folio_get_private(folio)
- : NULL;
+ return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 77b85da30fb2..71d0fa387509 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
- if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
- ret = 1;
- break;
- }
- start = iter.pos;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
+ subvol, 0, k, ({
+ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
+ })));
}
static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* folio
*/
- ret = range_has_data(c, inode->ei_subvol,
+ ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
@@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans,
- inode->ei_subvol, &snapshot);
+ inode->ei_inum.subvol, &snapshot);
if (ret)
goto bkey_err;
@@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot;
u64 sectors = end - start;
- u64 pos = start;
- int ret;
-retry:
- bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, pos, snapshot), 0);
-
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
- !(ret = bkey_err(k))) {
- if (bkey_extent_is_allocation(k.k)) {
- u64 s = min(end, k.k->p.offset) -
- max(start, bkey_start_offset(k.k));
- BUG_ON(s > sectors);
- sectors -= s;
- }
- bch2_btree_iter_advance(&iter);
- }
- pos = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+ inode->ei_inum.subvol, 0, k, ({
+ if (bkey_extent_is_allocation(k.k)) {
+ u64 s = min(end, k.k->p.offset) -
+ max(start, bkey_start_offset(k.k));
+ BUG_ON(s > sectors);
+ sectors -= s;
+ }
+
+ 0;
+ })));
return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- POS(inode->v.i_ino, U64_MAX),
- 0, k, ret) {
- if (bkey_extent_is_data(k.k)) {
- next_data = max(offset, bkey_start_offset(k.k) << 9);
- break;
- } else if (k.k->p.offset >> 9 > isize)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, 0, k, ({
+ if (bkey_extent_is_data(k.k)) {
+ next_data = max(offset, bkey_start_offset(k.k) << 9);
+ break;
+ } else if (k.k->p.offset >> 9 > isize)
+ break;
+ 0;
+ })));
if (ret)
return ret;
@@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
- u32 snapshot;
- int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
- trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
- SPOS(inode->v.i_ino, offset >> 9, snapshot),
- BTREE_ITER_slots, k, ret) {
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
+ POS(inode->v.i_ino, offset >> 9),
+ POS(inode->v.i_ino, U64_MAX),
+ inum.subvol, BTREE_ITER_slots, k, ({
+ if (k.k->p.inode != inode->v.i_ino) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ offset, MAX_LFS_FILESIZE, 0, false);
break;
- } else {
- offset = max(offset, bkey_start_offset(k.k) << 9);
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ } else if (!bkey_extent_is_data(k.k)) {
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
+ max(offset, bkey_start_offset(k.k) << 9),
+ k.k->p.offset << 9, 0, false);
+
+ if (next_hole < k.k->p.offset << 9)
+ break;
+ } else {
+ offset = max(offset, bkey_start_offset(k.k) << 9);
+ }
+ 0;
+ })));
if (ret)
return ret;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 99c7fe987c74..405cf08bda34 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
@@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 011817afc3ad..4a1bb07a2574 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,7 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
- "%s: inode %u:%llu not found when updating",
+ "%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@@ -152,50 +152,106 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static int bch2_iget5_test(struct inode *vinode, void *p)
+static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
-
- return inode->ei_subvol == inum->subvol &&
- inode->ei_inode.bi_inum == inum->inum;
+ return a.subvol == b.subvol && a.inum == b.inum;
}
-static int bch2_iget5_set(struct inode *vinode, void *p)
+static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+ const void *obj)
{
- struct bch_inode_info *inode = to_bch_ei(vinode);
- subvol_inum *inum = p;
+ const struct bch_inode_info *inode = obj;
+ const subvol_inum *v = arg->key;
- inode->v.i_ino = inum->inum;
- inode->ei_subvol = inum->subvol;
- inode->ei_inode.bi_inum = inum->inum;
- return 0;
+ return !subvol_inum_eq(inode->ei_inum, *v);
}
-static unsigned bch2_inode_hash(subvol_inum inum)
+static const struct rhashtable_params bch2_vfs_inodes_params = {
+ .head_offset = offsetof(struct bch_inode_info, hash),
+ .key_offset = offsetof(struct bch_inode_info, ei_inum),
+ .key_len = sizeof(subvol_inum),
+ .obj_cmpfn = bch2_vfs_inode_cmp_fn,
+ .automatic_shrinking = true,
+};
+
+static void __wait_on_freeing_inode(struct inode *inode)
{
- return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ wq = bit_waitqueue(&inode->i_state, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
}
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
- return to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
+}
+
+static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
+ subvol_inum inum)
+{
+ struct bch_inode_info *inode;
+repeat:
+ inode = __bch2_inode_hash_find(c, inum);
+ if (inode) {
+ spin_lock(&inode->v.i_lock);
+ if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+ spin_unlock(&inode->v.i_lock);
+ return NULL;
+ }
+ if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!trans) {
+ __wait_on_freeing_inode(&inode->v);
+ } else {
+ bch2_trans_unlock(trans);
+ __wait_on_freeing_inode(&inode->v);
+ int ret = bch2_trans_relock(trans);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ goto repeat;
+ }
+ __iget(&inode->v);
+ spin_unlock(&inode->v.i_lock);
+ }
+
+ return inode;
+}
+
+static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ spin_lock(&inode->v.i_lock);
+ bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+ spin_unlock(&inode->v.i_lock);
+
+ if (remove) {
+ int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+ &inode->hash, bch2_vfs_inodes_params);
+ BUG_ON(ret);
+ inode->v.i_hash.pprev = NULL;
+ }
}
-static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct bch_inode_info *inode)
{
- subvol_inum inum = inode_inum(inode);
- struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- bch2_iget5_set,
- &inum));
- BUG_ON(!old);
+ struct bch_inode_info *old = inode;
+
+ set_bit(EI_INODE_HASHED, &inode->ei_flags);
+retry:
+ if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+ &inode->hash,
+ bch2_vfs_inodes_params))) {
+ old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+ if (!old)
+ goto retry;
+
+ clear_bit(EI_INODE_HASHED, &inode->ei_flags);
- if (unlikely(old != inode)) {
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But
@@ -209,21 +265,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
*/
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
- inode = old;
+ return old;
} else {
+ inode_fake_hash(&inode->v);
+
+ inode_sb_list_add(&inode->v);
+
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
- /*
- * Again, I_NEW makes no sense for bcachefs. This is only needed
- * for clearing I_NEW, but since the inode was already fully
- * created and initialized we didn't actually want
- * inode_insert5() to set it for us.
- */
- unlock_new_inode(&inode->v);
+ return inode;
}
-
- return inode;
}
#define memalloc_flags_do(_flags, _do) \
@@ -241,7 +293,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
{
- struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
+ bch2_inode_cache, GFP_NOFS);
if (!inode)
return NULL;
@@ -283,13 +336,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
return inode;
}
+static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
+{
+ struct bch_inode_info *inode = bch2_new_inode(trans);
+ if (IS_ERR(inode))
+ return inode;
+
+ bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
+
+ return bch2_inode_hash_insert(trans->c, trans, inode);
+
+}
+
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
if (inode)
return &inode->v;
@@ -300,11 +364,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
int ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
- PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
- if (!ret) {
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
- }
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_trans_put(trans);
return ret ? ERR_PTR(ret) : &inode->v;
@@ -325,6 +385,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
/*
@@ -351,13 +413,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
- ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+ kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+ kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
+ ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
- from_kuid(i_user_ns(&dir->v), current_fsuid()),
- from_kgid(i_user_ns(&dir->v), current_fsgid()),
+ from_kuid(i_user_ns(&dir->v), kuid),
+ from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -365,7 +429,7 @@ retry:
if (unlikely(ret))
goto err_before_quota;
- inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
@@ -395,8 +459,16 @@ err_before_quota:
* we must insert the new inode into the inode cache before calling
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
+ *
+ * also, calling bch2_inode_hash_insert() without passing in the
+ * transaction object is sketchy - if we could ever end up in
+ * __wait_on_freeing_inode(), we'd risk deadlock.
+ *
+ * But that shouldn't be possible, since we still have the inode locked
+ * that we just created, and we _really_ can't take a transaction
+ * restart here.
*/
- inode = bch2_inode_insert(c, inode);
+ inode = bch2_inode_hash_insert(c, NULL, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@@ -436,11 +508,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
goto err;
- struct bch_inode_info *inode =
- to_bch_ei(ilookup5_nowait(c->vfs_sb,
- bch2_inode_hash(inum),
- bch2_iget5_test,
- &inum));
+ struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
if (inode)
goto out;
@@ -448,7 +516,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
- PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
c, "dirent to missing inode:\n %s",
@@ -468,9 +536,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
ret = -ENOENT;
goto err;
}
-
- bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
- inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -557,8 +622,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
- bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
return bch2_err_class(ret);
@@ -614,7 +679,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret);
}
@@ -671,15 +736,16 @@ static int bch2_rename2(struct mnt_idmap *idmap,
struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
- struct bch_inode_unpacked src_inode_u, dst_inode_u;
+ struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
+ bool whiteout = !!(flags & RENAME_WHITEOUT);
int ret;
- if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
return -EINVAL;
if (mode == BCH_RENAME_OVERWRITE) {
@@ -697,8 +763,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
trans = bch2_trans_get(c);
- ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
- bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+ ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+ bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
goto err;
@@ -720,18 +786,48 @@ static int bch2_rename2(struct mnt_idmap *idmap,
if (ret)
goto err;
}
+retry:
+ bch2_trans_begin(trans);
- ret = commit_do(trans, NULL, NULL, 0,
- bch2_rename_trans(trans,
- inode_inum(src_dir), &src_dir_u,
- inode_inum(dst_dir), &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode));
+ ret = bch2_rename_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode);
if (unlikely(ret))
+ goto err_tx_restart;
+
+ if (whiteout) {
+ whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+ ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ bch2_inode_init_early(c, whiteout_inode_u);
+
+ ret = bch2_create_trans(trans,
+ inode_inum(src_dir), &src_dir_u,
+ whiteout_inode_u,
+ &src_dentry->d_name,
+ from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+ from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+ S_IFCHR|WHITEOUT_MODE, 0,
+ NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+ bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+ KEY_TYPE_QUOTA_PREALLOC);
+ if (unlikely(ret))
+ goto err_tx_restart;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (unlikely(ret)) {
+err_tx_restart:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
goto err;
+ }
BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
BUG_ON(dst_inode &&
@@ -779,11 +875,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
+ kuid_t kuid;
+ kgid_t kgid;
- if (ia_valid & ATTR_UID)
- bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
- if (ia_valid & ATTR_GID)
- bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+ }
+ if (ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+ }
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@@ -798,11 +900,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
- ? attr->ia_gid
+ ? kgid
: inode->v.i_gid;
- if (!in_group_p(gid) &&
- !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+ if (!in_group_or_capable(idmap, &inode->v,
+ make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@@ -818,17 +920,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
+ kuid_t kuid;
+ kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
- if (attr->ia_valid & ATTR_UID)
- qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+ if (attr->ia_valid & ATTR_UID) {
+ kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
+ }
- if (attr->ia_valid & ATTR_GID)
- qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+ if (attr->ia_valid & ATTR_GID) {
+ kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
+ }
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@@ -884,13 +992,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
- stat->uid = inode->v.i_uid;
- stat->gid = inode->v.i_gid;
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@@ -899,7 +1009,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
- stat->subvol = inode->ei_subvol;
+ stat->subvol = inode->ei_inum.subvol;
stat->result_mask |= STATX_SUBVOL;
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@@ -941,7 +1051,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem);
- ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
@@ -1034,7 +1144,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bkey_buf cur, prev;
unsigned offset_into_extent, sectors;
bool have_extent = false;
- u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -1050,21 +1159,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
- if (ret)
- goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(ei->v.i_ino, start, snapshot), 0);
+ POS(ei->v.i_ino, start), 0);
- while (!(ret = btree_trans_too_many_iters(trans)) &&
- (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
- !(ret = bkey_err(k))) {
+ while (true) {
enum btree_id data_btree = BTREE_ID_extents;
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_upto(&iter, end);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k)
+ break;
+
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(&iter);
@@ -1108,16 +1226,12 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-
- ret = bch2_trans_relock(trans);
- if (ret)
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
- start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
if (!ret && have_extent) {
bch2_trans_unlock(trans);
@@ -1173,7 +1287,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret)
return ret;
}
@@ -1305,8 +1419,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
return (struct bcachefs_fid) {
- .inum = inode->ei_inode.bi_inum,
- .subvol = inode->ei_subvol,
+ .inum = inode->ei_inum.inum,
+ .subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation,
};
}
@@ -1391,7 +1505,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?:
- inode->ei_subvol,
+ inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir,
};
@@ -1427,7 +1541,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret)
goto err;
@@ -1458,8 +1572,7 @@ retry:
if (ret)
goto err;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
} else {
/*
@@ -1480,8 +1593,7 @@ retry:
if (ret)
continue;
- if (target.subvol == inode->ei_subvol &&
- target.inum == inode->ei_inode.bi_inum)
+ if (subvol_inum_eq(target, inode->ei_inum))
goto found;
}
}
@@ -1513,12 +1625,15 @@ static const struct export_operations bch_export_ops = {
.get_name = bch2_get_name,
};
-static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+static void bch2_vfs_inode_init(struct btree_trans *trans,
+ subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
- bch2_iget5_set(&inode->v, &inum);
+ inode->v.i_ino = inum.inum;
+ inode->ei_inum = inum;
+ inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
@@ -1530,7 +1645,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
- inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@@ -1597,6 +1711,17 @@ static void bch2_evict_inode(struct inode *vinode)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
+ bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
+
+ /*
+ * evict() has waited for outstanding writeback, we'll do no more IO
+ * through this inode: it's safe to remove from VFS inode hashtable here
+ *
+ * Do that now so that other threads aren't blocked from pulling it back
+ * in, there's no reason for them to be:
+ */
+ if (!delete)
+ bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
@@ -1604,12 +1729,18 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
- if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+ if (delete) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode_inum(inode));
+
+ /*
+ * If we are deleting, we need it present in the vfs hash table
+ * so that fsck can check if unlinked inodes are still open:
+ */
+ bch2_inode_hash_remove(c, inode);
}
mutex_lock(&c->vfs_inodes_lock);
@@ -1639,7 +1770,7 @@ again:
mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
- if (!snapshot_list_has_id(s, inode->ei_subvol))
+ if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue;
if (!(inode->v.i_state & I_DONTCACHE) &&
@@ -1801,30 +1932,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- enum bch_opt_id i;
struct printbuf buf = PRINTBUF;
- int ret = 0;
- for (i = 0; i < bch2_opts_nr; i++) {
- const struct bch_option *opt = &bch2_opt_table[i];
- u64 v = bch2_opt_get_by_id(&c->opts, i);
+ bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+ OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+ printbuf_nul_terminate(&buf);
+ seq_puts(seq, buf.buf);
- if ((opt->flags & OPT_HIDDEN) ||
- !(opt->flags & OPT_MOUNT))
- continue;
-
- if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
- continue;
-
- printbuf_reset(&buf);
- bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
- OPT_SHOW_MOUNT_STYLE);
- seq_putc(seq, ',');
- seq_puts(seq, buf.buf);
- }
-
- if (buf.allocation_failure)
- ret = -ENOMEM;
+ int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
return ret;
}
@@ -2129,12 +2244,23 @@ static int bch2_init_fs_context(struct fs_context *fc)
return 0;
}
+void bch2_fs_vfs_exit(struct bch_fs *c)
+{
+ if (c->vfs_inodes_table.tbl)
+ rhashtable_destroy(&c->vfs_inodes_table);
+}
+
+int bch2_fs_vfs_init(struct bch_fs *c)
+{
+ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
+}
+
static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE,
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
@@ -2149,7 +2275,8 @@ int __init bch2_vfs_init(void)
{
int ret = -ENOMEM;
- bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT);
if (!bch2_inode_cache)
goto err;
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 990ec43e0365..da74ecc236e7 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -13,6 +13,9 @@
struct bch_inode_info {
struct inode v;
+ struct rhash_head hash;
+ subvol_inum ei_inum;
+
struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
@@ -24,8 +27,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- u32 ei_subvol;
-
/*
* When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices
@@ -50,10 +51,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
- return (subvol_inum) {
- .subvol = inode->ei_subvol,
- .inum = inode->ei_inode.bi_inum,
- };
+ return inode->ei_inum;
}
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
@@ -69,6 +67,7 @@ struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
* those:
*/
#define EI_INODE_SNAPSHOT 1
+#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@@ -189,6 +188,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
+void bch2_fs_vfs_exit(struct bch_fs *);
+int bch2_fs_vfs_init(struct bch_fs *);
+
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@@ -203,6 +205,10 @@ static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, su
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
+
+static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
+static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
+
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9b3470a97546..0d8b782b63fb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -21,6 +21,49 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+ return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+}
+
+static void dirent_inode_mismatch_msg(struct printbuf *out,
+ struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ prt_str(out, "inode points to dirent that does not point back:");
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, dirent.s_c);
+ prt_newline(out);
+ bch2_inode_unpacked_to_text(out, inode);
+}
+
+static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -346,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
+ if (!inode->bi_dir)
+ return 0;
- d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0,
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0,
dirent);
- ret = bkey_err(d) ?:
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
__remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -371,7 +417,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
+ if (!bch2_err_matches(ret, ENOENT))
+ bch_err_msg(c, ret, "removing dirent");
if (ret)
return ret;
@@ -626,12 +673,12 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
- bool seen_this_pos;
u64 count;
};
struct inode_walker {
bool first_this_inode;
+ bool have_inodes;
bool recalculate_sums;
struct bpos last_pos;
@@ -669,6 +716,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ /*
+ * We no longer have inodes for w->last_pos; clear this to avoid
+ * screwing up check_i_sectors/check_subdir_count if we take a
+ * transaction restart here:
+ */
+ w->have_inodes = false;
w->recalculate_sums = false;
w->inodes.nr = 0;
@@ -686,6 +739,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
+ w->have_inodes = true;
return 0;
}
@@ -740,9 +794,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, k.k->p)) {
- darray_for_each(w->inodes, i)
- i->seen_this_pos = false;
}
w->last_pos = k.k->p;
@@ -896,21 +947,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
@@ -920,13 +956,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
return ret;
}
-static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+static int check_inode_dirent_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
- u32 inode_snapshot, bool *write_inode)
+ bool *write_inode)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ u32 inode_snapshot = inode->bi_snapshot;
struct btree_iter dirent_iter = {};
struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
int ret = bkey_err(d);
@@ -936,13 +973,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
- fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
trans, inode_points_to_wrong_dirent,
- "inode points to dirent that does not point back:\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ "%s",
+ (printbuf_reset(&buf),
+ dirent_inode_mismatch_msg(&buf, c, d, inode),
+ buf.buf))) {
/*
* We just clear the backpointer fields for now. If we find a
* dirent that points to this inode in check_dirents(), we'll
@@ -963,7 +1000,7 @@ fsck_err:
return ret;
}
-static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
+static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
@@ -972,7 +1009,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
/* snapshot tree corruption, can't safely delete */
if (!inum.subvol) {
- bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
+ bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
}
@@ -1045,30 +1082,44 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_flags & BCH_INODE_unlinked) {
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- return ret;
+ if (!test_bit(BCH_FS_started, &c->flags)) {
+ /*
+ * If we're not in online fsck, don't delete unlinked
+ * inodes, just make sure they're on the deleted list.
+ *
+ * They might be referred to by a logged operation -
+ * i.e. we might have crashed in the middle of a
+ * truncate on an unlinked but open file - so we want to
+ * let the delete_dead_inodes kill it after resuming
+ * logged ops.
+ */
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ return ret;
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
- ret = 0;
- }
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
- if (u.bi_flags & BCH_INODE_unlinked &&
- !bch2_inode_open(c, k.k->p) &&
- (!c->sb.clean ||
- fsck_err(trans, inode_unlinked_but_clean,
- "filesystem marked clean, but inode %llu unlinked",
- u.bi_inum))) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- return ret;
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
+ if (ret)
+ goto err;
+ } else {
+ if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
+ trans, inode_unlinked_and_not_open,
+ "inode %llu%u unlinked and not open",
+ u.bi_inum, u.bi_snapshot)) {
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ return ret;
+ }
+ }
}
+ /* i_size_dirty is vestigal, since we now have logged ops for truncate * */
if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
@@ -1097,8 +1148,9 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
@@ -1126,7 +1178,7 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_dir || u.bi_dir_offset) {
- ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
}
@@ -1555,10 +1607,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
struct snapshots_seen *s,
- struct extent_ends *extent_ends)
+ struct extent_ends *extent_ends,
+ struct disk_reservation *res)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1568,7 +1620,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
}
- if (inode->last_pos.inode != k.k->p.inode) {
+ if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
ret = check_i_sectors(trans, inode);
if (ret)
goto err;
@@ -1578,12 +1630,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
+ struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(extent_i);
if (ret)
goto err;
- ret = check_key_has_inode(trans, iter, inode, i, k);
+ ret = check_key_has_inode(trans, iter, inode, extent_i, k);
if (ret)
goto err;
@@ -1592,24 +1644,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums);
if (ret)
goto err;
- }
- /*
- * Check inodes in reverse order, from oldest snapshots to newest,
- * starting from the inode that matches this extent's snapshot. If we
- * didn't have one, iterate over all inodes:
- */
- if (!i)
- i = &darray_last(inode->inodes);
-
- for (;
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
- continue;
+ /*
+ * Check inodes in reverse order, from oldest snapshots to
+ * newest, starting from the inode that matches this extent's
+ * snapshot. If we didn't have one, iterate over all inodes:
+ */
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
- if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
@@ -1629,13 +1676,25 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
iter->k.type = KEY_TYPE_whiteout;
+ break;
}
-
- if (bkey_extent_is_allocation(k.k))
- i->count += k.k->size;
}
+ }
- i->seen_this_pos = true;
+ ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (bkey_extent_is_allocation(k.k)) {
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
+
+ i->count += k.k->size;
+ }
}
if (k.k->type != KEY_TYPE_whiteout) {
@@ -1666,13 +1725,11 @@ int bch2_check_extents(struct bch_fs *c)
extent_ends_init(&extent_ends);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors_notnested(trans, &w));
@@ -1758,6 +1815,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (inode_points_to_dirent(target, d))
@@ -1770,7 +1828,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
prt_printf(&buf, "\n "),
bch2_inode_unpacked_to_text(&buf, target),
buf.buf)))
- goto out_noiter;
+ goto err;
if (!target->bi_dir &&
!target->bi_dir_offset) {
@@ -1779,7 +1837,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- struct btree_iter bp_iter = { NULL };
struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
ret = bkey_err(bp_dirent);
@@ -1840,7 +1897,6 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
-out_noiter:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -2075,7 +2131,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (dir->last_pos.inode != k.k->p.inode) {
+ if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
ret = check_subdir_count(trans, dir);
if (ret)
goto err;
@@ -2137,11 +2193,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
- i->count++;
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
out:
err:
fsck_err:
@@ -2164,12 +2224,9 @@ int bch2_check_dirents(struct bch_fs *c)
snapshots_seen_init(&s);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
check_subdir_count_notnested(trans, &dir));
@@ -2314,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v)
return false;
}
-/*
- * We've checked that inode backpointers point to valid dirents; here, it's
- * sufficient to check that the subvolume root has a dirent:
- */
-static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- return inode.bi_dir != 0;
-}
-
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -2348,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- ret = subvol_has_dirent(trans, s);
- if (ret < 0)
+ struct bch_inode_unpacked subvol_root;
+ ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &subvol_root);
+ if (ret)
break;
- if (fsck_err_on(!ret,
+ /*
+ * We've checked that inode backpointers point to valid dirents;
+ * here, it's sufficient to check that the subvolume root has a
+ * dirent:
+ */
+ if (fsck_err_on(!subvol_root.bi_dir,
trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
+ prt_newline(&buf),
+ bch2_inode_unpacked_to_text(&buf, &subvol_root),
buf.buf))) {
ret = reattach_subvol(trans, s);
break;
@@ -2450,10 +2501,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, &inode)) {
+ if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- }
if (bch2_err_matches(ret, ENOENT)) {
ret = 0;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 2be6be33afa3..753c208896c3 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- if (likely(k.k->type == KEY_TYPE_inode_v3))
- return bch2_inode_unpack_v3(k, unpacked);
- return bch2_inode_unpack_slowpath(k, unpacked);
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
+ return likely(k.k->type == KEY_TYPE_inode_v3)
+ ? bch2_inode_unpack_v3(k, unpacked)
+ : bch2_inode_unpack_slowpath(k, unpacked);
}
int bch2_inode_peek_nowarn(struct btree_trans *trans,
@@ -365,7 +367,7 @@ int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags)
{
int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+ bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
return ret;
}
@@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- prt_printf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
__bch2_inode_unpacked_to_text(out, inode);
}
@@ -1111,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot))
goto delete;
- if (c->sb.clean &&
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index f1fcb4c58039..695abd707cb6 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -69,6 +69,7 @@ typedef u64 u96;
struct bch_inode_unpacked {
u64 bi_inum;
+ u32 bi_snapshot;
u64 bi_journal_seq;
__le64 bi_hash_seed;
u64 bi_size;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 7ee3b75480df..e4fc17c548fd 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
*/
bool promote_full = (failed ||
*read_full ||
- READ_ONCE(c->promote_whole_extents));
+ READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if ((ret = bkey_err(k)))
goto out;
- if (bversion_cmp(k.k->version, rbio->version) ||
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
orig_k->k->k.size,
reflink_offset);
bch2_inconsistent_error(trans->c);
- ret = -EIO;
+ ret = -BCH_ERR_missing_indirect_extent;
goto err;
}
@@ -869,9 +869,15 @@ retry_pick:
goto hole;
if (pick_ret < 0) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
bch_err_inum_offset_ratelimited(c,
read_pos.inode, read_pos.offset << 9,
- "no device to read from");
+ "no device to read from: %s\n %s",
+ bch2_err_str(pick_ret),
+ buf.buf);
+ printbuf_exit(&buf);
goto err;
}
@@ -1025,7 +1031,7 @@ get_bio:
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
- rbio->version = k.k->version;
+ rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
@@ -1086,7 +1092,7 @@ get_bio:
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
- if (bch2_ec_read_extent(trans, rbio)) {
+ if (bch2_ec_read_extent(trans, rbio, k)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
@@ -1214,10 +1220,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
- goto err;
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 1d4761d15002..b5fe9e0dc155 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
- e->k.version = version;
+ e->k.bversion = version;
if (crc.csum_type ||
crc.compression_type ||
@@ -1447,9 +1447,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
+ &op->cl, &wp));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
@@ -1546,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos;
- id->k.version = op->version;
+ id->k.bversion = op->version;
id->k.size = sectors;
iter = bio->bi_iter;
@@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 7664b68e6a15..954f6a96e0f4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
goto out;
}
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) {
@@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
+ .e.nr_devs = 0,
.e.nr_required = 1,
};
@@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;
darray_for_each(i->ptrs, ptr)
- replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
@@ -1950,7 +1951,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
if (error ||
w->noflush ||
(!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ time_before(jiffies, j->last_flush_write +
+ msecs_to_jiffies(c->opts.journal_flush_delay)) &&
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 70b998d9f19c..ace291f175dd 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
- if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
+ if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- atomic_read(&c->btree_cache.dirty),
- c->btree_cache.used,
+ atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index f49fdca1d07d..6f4a4e1083c9 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
+ trans, logged_op_but_clean,
+ "filesystem marked as clean but have logged op\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf));
if (!fn)
return 0;
@@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
-
- return trans_was_restarted(trans, restart_count);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index e10fc1da71b1..232be8a44051 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = U64_MAX, \
.choices = _choices
+#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c,
*res = ret;
break;
+ case BCH_OPT_BITFIELD: {
+ s64 v = bch2_read_flag_list(val, opt->choices);
+ if (v < 0)
+ return v;
+ *res = v;
+ break;
+ }
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);
@@ -423,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out,
else
prt_str(out, opt->choices[v]);
break;
+ case BCH_OPT_BITFIELD:
+ prt_bitflags(out, opt->choices, v);
+ break;
case BCH_OPT_FN:
opt->fn.to_text(out, c, sb, v);
break;
@@ -431,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out,
}
}
+void bch2_opts_to_text(struct printbuf *out,
+ struct bch_opts opts,
+ struct bch_fs *c, struct bch_sb *sb,
+ unsigned show_mask, unsigned hide_mask,
+ unsigned flags)
+{
+ bool first = true;
+
+ for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) {
+ const struct bch_option *opt = &bch2_opt_table[i];
+
+ if ((opt->flags & hide_mask) || !(opt->flags & show_mask))
+ continue;
+
+ u64 v = bch2_opt_get_by_id(&opts, i);
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
+ continue;
+
+ if (!first)
+ prt_char(out, ',');
+ first = false;
+
+ bch2_opt_to_text(out, c, sb, opt, v, flags);
+ }
+}
+
int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
{
int ret = 0;
@@ -608,10 +646,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
return 0;
}
-void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+struct bch_dev_sb_opt_set {
+ void (*set_sb)(struct bch_member *, u64);
+};
+
+static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
+#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
+ BCH_DEV_OPT_SETTERS()
+#undef x
+};
+
+void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
+ enum bch_opt_id id = opt - bch2_opt_table;
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -619,16 +667,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = ilog2(v);
- opt->set_sb(sb, v);
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ v++;
+
+ if (opt->flags & OPT_FS) {
+ if (opt->set_sb != SET_BCH2_NO_SB_OPT)
+ opt->set_sb(sb, v);
+ }
+
+ if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return;
+
+ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
+
+ const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
+ if (set->set_sb)
+ set->set_sb(m, v);
+ else
+ pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ }
}
-void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_option *opt, u64 v)
{
- if (opt->set_sb == SET_BCH2_NO_SB_OPT)
- return;
-
mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index cda1725702ea..cb2e244a2429 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -53,23 +53,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
- OPT_FS = (1 << 0), /* Filesystem option */
- OPT_DEVICE = (1 << 1), /* Device option */
- OPT_INODE = (1 << 2), /* Inode option */
- OPT_FORMAT = (1 << 3), /* May be specified at format time */
- OPT_MOUNT = (1 << 4), /* May be specified at mount time */
- OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
- OPT_HUMAN_READABLE = (1 << 6),
- OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
- OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
- OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
- OPT_HIDDEN = (1 << 10),
+ OPT_FS = BIT(0), /* Filesystem option */
+ OPT_DEVICE = BIT(1), /* Device option */
+ OPT_INODE = BIT(2), /* Inode option */
+ OPT_FORMAT = BIT(3), /* May be specified at format time */
+ OPT_MOUNT = BIT(4), /* May be specified at mount time */
+ OPT_RUNTIME = BIT(5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = BIT(6),
+ OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
+ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
+ OPT_HIDDEN = BIT(11),
};
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
+ BCH_OPT_BITFIELD,
BCH_OPT_FN,
};
@@ -263,6 +265,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
+ x(promote_whole_extents, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
+ NULL, "Promote whole extents, instead of just part being read")\
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
@@ -366,6 +373,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
+ x(recovery_passes, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to run explicitly") \
+ x(recovery_passes_exclude, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BITFIELD(bch2_recovery_passes), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Recovery passes to exclude") \
x(recovery_pass_last, u8, \
OPT_FS|OPT_MOUNT, \
OPT_STR_NOLIMIT(bch2_recovery_passes), \
@@ -472,11 +489,16 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
- OPT_DEVICE, \
+ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
+ x(data_allowed, u8, \
+ OPT_DEVICE, \
+ OPT_BITFIELD(__bch2_data_types), \
+ BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ "types", "Allowed data types for this device: journal, btree, and/or user")\
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -484,6 +506,11 @@ enum fsck_err_opts {
NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
+#define BCH_DEV_OPT_SETTERS() \
+ x(discard, BCH_MEMBER_DISCARD) \
+ x(durability, BCH_MEMBER_DURABILITY) \
+ x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
+
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -563,8 +590,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
-void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
+void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+
+struct bch_dev;
+void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
@@ -576,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *,
void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
const struct bch_option *, u64, unsigned);
+void bch2_opts_to_text(struct printbuf *,
+ struct bch_opts,
+ struct bch_fs *, struct bch_sb *,
+ unsigned, unsigned, unsigned);
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
new file mode 100644
index 000000000000..40a20192eee8
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.c
@@ -0,0 +1,650 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+
+#include <linux/generic-radix-tree.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/vmalloc.h>
+
+#include "rcu_pending.h"
+#include "darray.h"
+#include "util.h"
+
+#define static_array_for_each(_a, _i) \
+ for (typeof(&(_a)[0]) _i = _a; \
+ _i < (_a) + ARRAY_SIZE(_a); \
+ _i++)
+
+enum rcu_pending_special {
+ RCU_PENDING_KVFREE = 1,
+ RCU_PENDING_CALL_RCU = 2,
+};
+
+#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
+#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
+
+static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? get_state_synchronize_srcu(ssp)
+ : get_state_synchronize_rcu();
+}
+
+static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
+{
+ return ssp
+ ? start_poll_synchronize_srcu(ssp)
+ : start_poll_synchronize_rcu();
+}
+
+static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return ssp
+ ? poll_state_synchronize_srcu(ssp, cookie)
+ : poll_state_synchronize_rcu(cookie);
+}
+
+static inline void __rcu_barrier(struct srcu_struct *ssp)
+{
+ return ssp
+ ? srcu_barrier(ssp)
+ : rcu_barrier();
+}
+
+static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func)
+{
+ if (ssp)
+ call_srcu(ssp, rhp, func);
+ else
+ call_rcu(rhp, func);
+}
+
+struct rcu_pending_seq {
+ /*
+ * We're using a radix tree like a vector - we're just pushing elements
+ * onto the end; we're using a radix tree instead of an actual vector to
+ * avoid reallocation overhead
+ */
+ GENRADIX(struct rcu_head *) objs;
+ size_t nr;
+ struct rcu_head **cursor;
+ unsigned long seq;
+};
+
+struct rcu_pending_list {
+ struct rcu_head *head;
+ struct rcu_head *tail;
+ unsigned long seq;
+};
+
+struct rcu_pending_pcpu {
+ struct rcu_pending *parent;
+ spinlock_t lock;
+ int cpu;
+
+ /*
+ * We can't bound the number of unprocessed gp sequence numbers, and we
+ * can't efficiently merge radix trees for expired grace periods, so we
+ * need darray/vector:
+ */
+ DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
+
+ /* Third entry is for expired objects: */
+ struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
+
+ struct rcu_head cb;
+ bool cb_armed;
+ struct work_struct work;
+};
+
+static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
+{
+ if (p->objs.nr)
+ return true;
+
+ static_array_for_each(p->lists, i)
+ if (i->head)
+ return true;
+
+ return false;
+}
+
+static void rcu_pending_list_merge(struct rcu_pending_list *l1,
+ struct rcu_pending_list *l2)
+{
+#ifdef __KERNEL__
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next = l2->head;
+#else
+ if (!l1->head)
+ l1->head = l2->head;
+ else
+ l1->tail->next.next = (void *) l2->head;
+#endif
+
+ l1->tail = l2->tail;
+ l2->head = l2->tail = NULL;
+}
+
+static void rcu_pending_list_add(struct rcu_pending_list *l,
+ struct rcu_head *n)
+{
+#ifdef __KERNEL__
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next = n;
+ l->tail = n;
+ n->next = NULL;
+#else
+ if (!l->head)
+ l->head = n;
+ else
+ l->tail->next.next = (void *) n;
+ l->tail = n;
+ n->next.next = NULL;
+#endif
+}
+
+static void merge_expired_lists(struct rcu_pending_pcpu *p)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ for (struct rcu_pending_list *i = p->lists; i < expired; i++)
+ if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
+ rcu_pending_list_merge(expired, i);
+}
+
+#ifndef __KERNEL__
+static inline void kfree_bulk(size_t nr, void ** p)
+{
+ while (nr--)
+ kfree(*p);
+}
+
+#define local_irq_save(flags) \
+do { \
+ flags = 0; \
+} while (0)
+#endif
+
+static noinline void __process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ struct rcu_pending_seq objs = {};
+ struct rcu_head *list = NULL;
+
+ if (p->objs.nr &&
+ __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
+ objs = p->objs.data[0];
+ darray_remove_item(&p->objs, p->objs.data);
+ }
+
+ merge_expired_lists(p);
+
+ list = expired->head;
+ expired->head = expired->tail = NULL;
+
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ for (size_t i = 0; i < objs.nr; ) {
+ size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
+
+ kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
+ i += nr_this_node;
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+
+ /*
+ * low bit of pointer indicates whether rcu_head needs
+ * to be freed - kvfree_rcu_mightsleep()
+ */
+ BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
+
+ void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
+ bool free_head = ((unsigned long) obj->func) & 1UL;
+
+ kvfree(ptr);
+ if (free_head)
+ kfree(obj);
+ }
+
+ break;
+
+ case RCU_PENDING_CALL_RCU:
+ for (size_t i = 0; i < objs.nr; i++) {
+ struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
+ obj->func(obj);
+ }
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ obj->func(obj);
+ }
+ break;
+
+ default:
+ for (size_t i = 0; i < objs.nr; i++)
+ pending->process(pending, *genradix_ptr(&objs.objs, i));
+ genradix_free(&objs.objs);
+
+ while (list) {
+ struct rcu_head *obj = list;
+#ifdef __KERNEL__
+ list = obj->next;
+#else
+ list = (void *) obj->next.next;
+#endif
+ pending->process(pending, obj);
+ }
+ break;
+ }
+}
+
+static bool process_finished_items(struct rcu_pending *pending,
+ struct rcu_pending_pcpu *p,
+ unsigned long flags)
+{
+ /*
+ * XXX: we should grab the gp seq once and avoid multiple function
+ * calls, this is called from __rcu_pending_enqueue() fastpath in
+ * may_sleep==true mode
+ */
+ if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
+ (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
+ (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
+ p->lists[2].head) {
+ __process_finished_items(pending, p, flags);
+ return true;
+ }
+
+ return false;
+}
+
+static void rcu_pending_work(struct work_struct *work)
+{
+ struct rcu_pending_pcpu *p =
+ container_of(work, struct rcu_pending_pcpu, work);
+ struct rcu_pending *pending = p->parent;
+ unsigned long flags;
+
+ do {
+ spin_lock_irqsave(&p->lock, flags);
+ } while (process_finished_items(pending, p, flags));
+
+ spin_unlock_irqrestore(&p->lock, flags);
+}
+
+static void rcu_pending_rcu_cb(struct rcu_head *rcu)
+{
+ struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
+
+ schedule_work_on(p->cpu, &p->work);
+
+ unsigned long flags;
+ spin_lock_irqsave(&p->lock, flags);
+ if (__rcu_pending_has_pending(p)) {
+ spin_unlock_irqrestore(&p->lock, flags);
+ __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ p->cb_armed = false;
+ spin_unlock_irqrestore(&p->lock, flags);
+ }
+}
+
+static __always_inline struct rcu_pending_seq *
+get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
+{
+ darray_for_each_reverse(p->objs, objs)
+ if (objs->seq == seq)
+ return objs;
+
+ if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
+ return NULL;
+
+ return &darray_last(p->objs);
+}
+
+static noinline bool
+rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
+ struct rcu_head *head, void *ptr,
+ unsigned long *flags)
+{
+ if (ptr) {
+ if (!head) {
+ /*
+ * kvfree_rcu_mightsleep(): we weren't passed an
+ * rcu_head, but we need one: use the low bit of the
+ * ponter to free to flag that the head needs to be
+ * freed as well:
+ */
+ ptr = (void *)(((unsigned long) ptr)|1UL);
+ head = kmalloc(sizeof(*head), __GFP_NOWARN);
+ if (!head) {
+ spin_unlock_irqrestore(&p->lock, *flags);
+ head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
+ /*
+ * dropped lock, did GFP_KERNEL allocation,
+ * check for gp expiration
+ */
+ if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
+ kvfree(--ptr);
+ kfree(head);
+ spin_lock_irqsave(&p->lock, *flags);
+ return false;
+ }
+ }
+ }
+
+ head->func = ptr;
+ }
+again:
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (i->seq == seq) {
+ rcu_pending_list_add(i, head);
+ return false;
+ }
+ }
+
+ for (struct rcu_pending_list *i = p->lists;
+ i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
+ if (!i->head) {
+ i->seq = seq;
+ rcu_pending_list_add(i, head);
+ return true;
+ }
+ }
+
+ merge_expired_lists(p);
+ goto again;
+}
+
+/*
+ * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
+ * pending->pracess) once grace period elapses.
+ *
+ * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
+ * back to a linked list.
+ *
+ * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
+ * process callback
+ *
+ * - If @ptr and @head are both not NULL, we're kvfree_rcu()
+ *
+ * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
+ *
+ * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
+ * expired items.
+ */
+static __always_inline void
+__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
+ void *ptr, bool may_sleep)
+{
+
+ struct rcu_pending_pcpu *p;
+ struct rcu_pending_seq *objs;
+ struct genradix_node *new_node = NULL;
+ unsigned long seq, flags;
+ bool start_gp = false;
+
+ BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ seq = __get_state_synchronize_rcu(pending->srcu);
+restart:
+ if (may_sleep &&
+ unlikely(process_finished_items(pending, p, flags)))
+ goto check_expired;
+
+ /*
+ * In kvfree_rcu() mode, the radix tree is only for slab pointers so
+ * that we can do kfree_bulk() - vmalloc pointers always use the linked
+ * list:
+ */
+ if (ptr && unlikely(is_vmalloc_addr(ptr)))
+ goto list_add;
+
+ objs = get_object_radix(p, seq);
+ if (unlikely(!objs))
+ goto list_add;
+
+ if (unlikely(!objs->cursor)) {
+ /*
+ * New radix tree nodes must be added under @p->lock because the
+ * tree root is in a darray that can be resized (typically,
+ * genradix supports concurrent unlocked allocation of new
+ * nodes) - hence preallocation and the retry loop:
+ */
+ objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
+ objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
+ if (unlikely(!objs->cursor)) {
+ if (may_sleep) {
+ spin_unlock_irqrestore(&p->lock, flags);
+
+ gfp_t gfp = GFP_KERNEL;
+ if (!head)
+ gfp |= __GFP_NOFAIL;
+
+ new_node = genradix_alloc_node(gfp);
+ if (!new_node)
+ may_sleep = false;
+ goto check_expired;
+ }
+list_add:
+ start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
+ goto start_gp;
+ }
+ }
+
+ *objs->cursor++ = ptr ?: head;
+ /* zero cursor if we hit the end of a radix tree node: */
+ if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
+ objs->cursor = NULL;
+ start_gp = !objs->nr;
+ objs->nr++;
+start_gp:
+ if (unlikely(start_gp)) {
+ /*
+ * We only have one callback (ideally, we would have one for
+ * every outstanding graceperiod) - so if our callback is
+ * already in flight, we may still have to start a grace period
+ * (since we used get_state() above, not start_poll())
+ */
+ if (!p->cb_armed) {
+ p->cb_armed = true;
+ __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
+ } else {
+ __start_poll_synchronize_rcu(pending->srcu);
+ }
+ }
+ spin_unlock_irqrestore(&p->lock, flags);
+free_node:
+ if (new_node)
+ genradix_free_node(new_node);
+ return;
+check_expired:
+ if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
+ switch ((ulong) pending->process) {
+ case RCU_PENDING_KVFREE:
+ kvfree(ptr);
+ break;
+ case RCU_PENDING_CALL_RCU:
+ head->func(head);
+ break;
+ default:
+ pending->process(pending, head);
+ break;
+ }
+ goto free_node;
+ }
+
+ local_irq_save(flags);
+ p = this_cpu_ptr(pending->p);
+ spin_lock(&p->lock);
+ goto restart;
+}
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
+{
+ __rcu_pending_enqueue(pending, obj, NULL, true);
+}
+
+static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
+{
+ struct rcu_head *ret = NULL;
+
+ spin_lock_irq(&p->lock);
+ darray_for_each(p->objs, objs)
+ if (objs->nr) {
+ ret = *genradix_ptr(&objs->objs, --objs->nr);
+ objs->cursor = NULL;
+ if (!objs->nr)
+ genradix_free(&objs->objs);
+ goto out;
+ }
+
+ static_array_for_each(p->lists, i)
+ if (i->head) {
+ ret = i->head;
+#ifdef __KERNEL__
+ i->head = ret->next;
+#else
+ i->head = (void *) ret->next.next;
+#endif
+ if (!i->head)
+ i->tail = NULL;
+ goto out;
+ }
+out:
+ spin_unlock_irq(&p->lock);
+
+ return ret;
+}
+
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
+{
+ return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
+}
+
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
+{
+ struct rcu_head *ret = rcu_pending_dequeue(pending);
+
+ if (ret)
+ return ret;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ spin_lock_irq(&p->lock);
+ if (__rcu_pending_has_pending(p) || p->cb_armed) {
+ spin_unlock_irq(&p->lock);
+ return true;
+ }
+ spin_unlock_irq(&p->lock);
+ }
+
+ return false;
+}
+
+void rcu_pending_exit(struct rcu_pending *pending)
+{
+ int cpu;
+
+ if (!pending->p)
+ return;
+
+ while (rcu_pending_has_pending_or_armed(pending)) {
+ __rcu_barrier(pending->srcu);
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ flush_work(&p->work);
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+
+ static_array_for_each(p->lists, i)
+ WARN_ON(i->head);
+ WARN_ON(p->objs.nr);
+ darray_exit(&p->objs);
+ }
+ free_percpu(pending->p);
+}
+
+/**
+ * rcu_pending_init: - initialize a rcu_pending
+ *
+ * @pending: Object to init
+ * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
+ * RCU flavor
+ * @process: Callback function invoked on objects once their RCU barriers
+ * have completed; if NULL, kvfree() is used.
+ */
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process)
+{
+ pending->p = alloc_percpu(struct rcu_pending_pcpu);
+ if (!pending->p)
+ return -ENOMEM;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
+ p->parent = pending;
+ p->cpu = cpu;
+ spin_lock_init(&p->lock);
+ darray_init(&p->objs);
+ INIT_WORK(&p->work, rcu_pending_work);
+ }
+
+ pending->srcu = srcu;
+ pending->process = process;
+
+ return 0;
+}
diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h
new file mode 100644
index 000000000000..71a2f4ddaade
--- /dev/null
+++ b/fs/bcachefs/rcu_pending.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RCU_PENDING_H
+#define _LINUX_RCU_PENDING_H
+
+#include <linux/rcupdate.h>
+
+struct rcu_pending;
+typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
+
+struct rcu_pending_pcpu;
+
+struct rcu_pending {
+ struct rcu_pending_pcpu __percpu *p;
+ struct srcu_struct *srcu;
+ rcu_pending_process_fn process;
+};
+
+void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
+struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
+struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
+
+void rcu_pending_exit(struct rcu_pending *pending);
+int rcu_pending_init(struct rcu_pending *pending,
+ struct srcu_struct *srcu,
+ rcu_pending_process_fn process);
+
+#endif /* _LINUX_RCU_PENDING_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index cf81e5128c3a..2d299a37cf07 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -13,6 +13,7 @@
#include "errcode.h"
#include "error.h"
#include "inode.h"
+#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
@@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
if (!data_opts->rewrite_ptrs) {
/*
@@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
+ data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
return data_opts->rewrite_ptrs != 0;
}
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 36de1c6fe8c3..6db72d3bad7d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
@@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
+ if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
ret = 0;
goto out;
}
@@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c)
"error reading btree root %s l=%u: %s",
bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
if (btree_id_is_alloc(i)) {
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
r->error = 0;
- } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+ } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
bch_info(c, "will run btree node scan");
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
}
ret = 0;
@@ -706,17 +706,19 @@ int bch2_fs_recovery(struct bch_fs *c)
if (check_version_upgrade(c))
write_sb = true;
- c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -862,6 +864,9 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* in case we don't run journal replay, i.e. norecovery mode */
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
/* fsync if we fixed errors */
if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 73339a0a3111..735b8adc8f9d 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+ if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
return bch2_fs_read_write_early(c);
return 0;
}
@@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v)
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
enum bch_recovery_pass pass)
{
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return 0;
bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
- c->recovery_passes_explicit |= BIT_ULL(pass);
+ c->opts.recovery_passes |= BIT_ULL(pass);
if (c->curr_recovery_pass >= pass) {
c->curr_recovery_pass = pass;
@@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
{
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- if (c->recovery_passes_explicit & BIT_ULL(pass))
+ if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
+ return false;
+ if (c->opts.recovery_passes & BIT_ULL(pass))
return true;
if ((p->when & PASS_FSCK) && c->opts.fsck)
return true;
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 8c7dee5983d2..50406ce0e4ef 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -50,7 +50,7 @@
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index e59c0abb4772..f457925fa362 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.version = orig->k.version;
+ r_v->k.bversion = orig->k.bversion;
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 1f34c92a6d11..bcb3276747e0 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
+static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
@@ -94,6 +94,16 @@ bad:
return -BCH_ERR_invalid_replicas_entry;
}
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_fs *c,
+ struct printbuf *err)
+{
+ mutex_lock(&c->sb_lock);
+ int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
@@ -123,7 +133,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
- r->devs[r->nr_devs++] = p.ptr.dev;
+ replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@@ -140,7 +150,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
- r->devs[r->nr_devs++] = ptr->dev;
+ replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@@ -181,7 +191,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
- e->devs[e->nr_devs++] = *i;
+ replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}
@@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate(e, sb, err);
+ int ret = bch2_replicas_entry_validate_locked(e, sb, err);
if (ret)
return ret;
@@ -795,12 +805,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
for (unsigned i = 0; i < e->nr_devs; i++) {
nr_online += test_bit(e->devs[i], devs.d);
- struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}
rcu_read_unlock();
- if (nr_failed == e->nr_devs)
+ if (nr_online + nr_failed == e->nr_devs)
continue;
if (nr_online < e->nr_required)
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 622482559c3d..5aba2c1ce133 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_sb *, struct printbuf *);
+ struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 *
diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h
index b97208195d06..b7eff904acdb 100644
--- a/fs/bcachefs/replicas_format.h
+++ b/fs/bcachefs/replicas_format.h
@@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas_v0 {
@@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[];
+ __u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas {
@@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
+#define replicas_entry_add_dev(e, d) ({ \
+ (e)->nr_devs++; \
+ (e)->devs[(e)->nr_devs - 1] = (d); \
+})
+
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index c57d42bb8d1b..005275281804 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -155,7 +155,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
mutex_unlock(&c->sb_lock);
- return NULL;
+ return ERR_PTR(-BCH_ERR_invalid_sb_clean);
}
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
@@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
+ kfree(clean);
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index c7e4cdd3f6a5..5102059a0f1d 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
if (!first)
prt_char(out, ',');
first = false;
- unsigned e = le16_to_cpu(i->errors[j]);
- prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
}
prt_newline(out);
}
@@ -353,7 +352,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);
- downgrade_table_extra(c, &table);
+ ret = downgrade_table_extra(c, &table);
+ if (ret)
+ goto out;
if (!dst->recovery_passes[0] &&
!dst->recovery_passes[1] &&
@@ -399,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_SB_ERR_MAX)
+ if (e < BCH_FSCK_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
__set_bit_le64(e, ext->errors_silent);
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index c1270d790e43..013a96883b4e 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -7,12 +7,12 @@
const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_SB_ERRS()
- NULL
+#undef x
};
-static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
- if (id < BCH_SB_ERR_MAX)
+ if (id < BCH_FSCK_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]);
else
prt_printf(out, "(unknown error %u)", id);
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index 8889001e7db4..b2357b8e6107 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -6,6 +6,8 @@
extern const char * const bch2_sb_error_strs[];
+void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index f0c14702f9e6..ed5dca5e1161 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -210,22 +210,23 @@ enum bch_fsck_flags {
x(inode_snapshot_mismatch, 196, 0) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_unlinked_and_not_open, 281, 0) \
x(inode_checksum_type_invalid, 199, 0) \
x(inode_compression_type_invalid, 200, 0) \
x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, 0) \
- x(inode_i_sectors_dirty_but_clean, 203, 0) \
- x(inode_i_sectors_wrong, 204, 0) \
- x(inode_dir_wrong_nlink, 205, 0) \
- x(inode_dir_multiple_links, 206, 0) \
- x(inode_multiple_links_but_nlink_0, 207, 0) \
- x(inode_wrong_backpointer, 208, 0) \
- x(inode_wrong_nlink, 209, 0) \
- x(inode_unreachable, 210, 0) \
- x(deleted_inode_but_clean, 211, 0) \
- x(deleted_inode_missing, 212, 0) \
- x(deleted_inode_is_dir, 213, 0) \
- x(deleted_inode_not_unlinked, 214, 0) \
+ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
+ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
+ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
+ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
+ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
+ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
+ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
+ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
+ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
+ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \
@@ -255,7 +256,7 @@ enum bch_fsck_flags {
x(dir_loop, 241, 0) \
x(hash_table_key_duplicate, 242, 0) \
x(hash_table_key_wrong_offset, 243, 0) \
- x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
@@ -270,7 +271,7 @@ enum bch_fsck_flags {
x(subvol_children_not_set, 256, 0) \
x(subvol_children_bad, 257, 0) \
x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, 0) \
+ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
@@ -282,8 +283,8 @@ enum bch_fsck_flags {
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \
- x(alloc_key_stripe_sectors_wrong, 271, 0) \
- x(accounting_mismatch, 272, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
+ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
@@ -292,12 +293,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
+ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
+ x(MAX, 284, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
- BCH_SB_ERR_MAX
};
struct bch_sb_field_errors {
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 4b765422dd77..02bcde3c1b02 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -465,3 +465,60 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
}
}
+
+unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
+{
+ unsigned nr = 0;
+
+ for (unsigned i = 0; i < sb->nr_devices; i++)
+ nr += bch2_member_exists((struct bch_sb *) sb, i);
+ return nr;
+}
+
+int bch2_sb_member_alloc(struct bch_fs *c)
+{
+ unsigned dev_idx = c->sb.nr_devices;
+ struct bch_sb_field_members_v2 *mi;
+ unsigned nr_devices;
+ unsigned u64s;
+ int best = -1;
+ u64 best_last_mount = 0;
+
+ if (dev_idx < BCH_SB_MEMBERS_MAX)
+ goto have_slot;
+
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
+ /* eventually BCH_SB_MEMBERS_MAX will be raised */
+ if (dev_idx == BCH_SB_MEMBER_INVALID)
+ continue;
+
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
+ if (bch2_member_alive(&m))
+ continue;
+
+ u64 last_mount = le64_to_cpu(m.last_mount);
+ if (best < 0 || last_mount < best_last_mount) {
+ best = dev_idx;
+ best_last_mount = last_mount;
+ }
+ }
+ if (best >= 0) {
+ dev_idx = best;
+ goto have_slot;
+ }
+
+ return -BCH_ERR_ENOSPC_sb_members;
+have_slot:
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+ if (!mi)
+ return -BCH_ERR_ENOSPC_sb_members;
+
+ c->disk_sb.sb->nr_devices = nr_devices;
+ return dev_idx;
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index dd93192ec065..762083b564ee 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev)
lockdep_is_held(&c->state_lock));
}
-static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev)
{
return c && dev < c->sb.nr_devices
? rcu_dereference(c->devs[dev])
: NULL;
}
+void bch2_dev_missing(struct bch_fs *, unsigned);
+
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
+{
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
+ if (unlikely(!ca))
+ bch2_dev_missing(c, dev);
+ return ca;
+}
+
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
rcu_read_unlock();
return ca;
}
-void bch2_dev_missing(struct bch_fs *, unsigned);
-
static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
- if (!ca)
+ if (unlikely(!ca))
bch2_dev_missing(c, dev);
return ca;
}
@@ -307,6 +315,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
return false;
}
+unsigned bch2_sb_nr_devices(const struct bch_sb *);
+
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
@@ -352,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+int bch2_sb_member_alloc(struct bch_fs *);
+
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 3a494c5d1247..617d07e53b20 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
ret = -1 - SIX_LOCK_write;
}
} else if (type == SIX_LOCK_write && lock->readers) {
- if (try) {
+ if (try)
atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
+ /*
+ * Make sure atomic_add happens before pcpu_read_count and
+ * six_set_bitmask in slow path happens before pcpu_read_count.
+ *
+ * Paired with the smp_mb() in read lock fast path (per-cpu mode)
+ * and the one before atomic_read in read unlock path.
+ */
+ smp_mb();
ret = !pcpu_read_count(lock);
if (try && !ret) {
@@ -335,7 +341,7 @@ static inline bool six_owner_running(struct six_lock *lock)
*/
rcu_read_lock();
struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
rcu_read_unlock();
return ret;
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 8b18a9b483a4..1809442b00ee 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 id = snapshot_root;
u32 subvol = 0, s;
+ rcu_read_lock();
while (id) {
s = snapshot_t(c, id)->subvol;
@@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
id = bch2_snapshot_tree_next(c, id);
}
+ rcu_read_unlock();
return subvol;
}
@@ -1782,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
new->k.p.snapshot = leaf_id;
ret = bch2_trans_update(trans, &iter, new, 0);
out:
+ bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index c8c266cb5797..215eed4cce6d 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
POS(insert->k.p.inode, U64_MAX),
- BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
+ BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index dbe834cb349f..6845dde1b339 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans,
}
struct bch_inode_unpacked inode;
- struct btree_iter inode_iter = {};
- ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- 0);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret,
- trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode_iter.k.p.snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
- if (ret)
+ &inode);
+ if (!ret) {
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ trans, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode.bi_snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+ } else if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(trans, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ ret = ret ?: -BCH_ERR_transaction_restart_nested;
goto err;
+ }
+ } else {
+ goto err;
}
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
@@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans,
"%s: snapshot tree %u not found", __func__, snapshot_tree);
if (ret)
- return ret;
+ goto err;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
trans, subvol_not_master_and_not_snapshot,
@@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans,
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(s);
if (ret)
- return ret;
+ goto err;
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a8299ba2cab2..e62f876541fe 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
+static inline struct bkey_s_c
+bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
+ u32 subvolid, unsigned flags)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ if (ret)
+ return bkey_s_c_err(ret);
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+ return bch2_btree_iter_peek_upto_type(iter, end, flags);
+}
+
+#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct bkey_s_c _k; \
+ int _ret3 = 0; \
+ \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \
+ _end, _subvolid, (_flags)); \
+ if (!(_k).k) \
+ break; \
+ \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ \
+ bch2_trans_iter_exit((_trans), &(_iter)); \
+ _ret3; \
+})
+
+#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \
+ _start, _end, _subvolid, _flags, _k, _do) \
+({ \
+ struct btree_iter _iter; \
+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ \
+ for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
+ _end, _subvolid, _flags, _k, _do); \
+})
+
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 9b10c8947828..f2ec4277c2a5 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -30,7 +30,8 @@ struct snapshot_table {
};
typedef struct {
- u32 subvol;
+ /* we can't have padding in this struct: */
+ u64 subvol;
u64 inum;
} subvol_inum;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index c8c2ccbdfbb5..ce7410d72089 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
!BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
+ SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
@@ -796,8 +799,10 @@ retry:
i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i);
- if (offset == opt_get(*opts, sb))
+ if (offset == opt_get(*opts, sb)) {
+ ret = -BCH_ERR_invalid;
continue;
+ }
ret = read_one_super(sb, offset, &err);
if (!ret)
@@ -1185,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
+ min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
prt_newline(out);
kfree(errors_silent);
@@ -1292,15 +1298,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- u64 fields_have = 0;
- unsigned nr_devices = 0;
-
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
- for (int i = 0; i < sb->nr_devices; i++)
- nr_devices += bch2_member_exists(sb, i);
-
prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
@@ -1356,9 +1356,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_newline(out);
prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
- prt_printf(out, "Devices:\t%u\n", nr_devices);
+ prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
prt_printf(out, "Sections:\t");
+ u64 fields_have = 0;
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
prt_bitflags(out, bch2_sb_fields, fields_have);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index e7fa2de35014..873e4be7e1dc 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c)
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
- BUG_ON(atomic_read(&c->btree_cache.dirty));
+ BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
@@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c);
@@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
- c->promote_whole_extents = true;
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
@@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
+ bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c);
@@ -1591,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
/* Device add/removal: */
-static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bpos start = POS(ca->dev_idx, 0);
- struct bpos end = POS(ca->dev_idx, U64_MAX);
- int ret;
-
- /*
- * We clear the LRU and need_discard btrees first so that we don't race
- * with bch2_do_invalidates() and bch2_do_discards()
- */
- ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL) ?:
- bch2_dev_usage_remove(c, ca->dev_idx);
- bch_err_msg(c, ret, "removing dev alloc info");
- return ret;
-}
-
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
@@ -1729,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members_v2 *mi;
- struct bch_member dev_mi;
- unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
int ret;
@@ -1741,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
- dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
+ struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
@@ -1779,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
if (dynamic_fault("bcachefs:add:no_slot"))
- goto no_slot;
-
- if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) {
- dev_idx = c->sb.nr_devices;
- goto have_slot;
- }
-
- int best = -1;
- u64 best_last_mount = 0;
- for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) {
- struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
- continue;
-
- u64 last_mount = le64_to_cpu(m.last_mount);
- if (best < 0 || last_mount < best_last_mount) {
- best = dev_idx;
- best_last_mount = last_mount;
- }
- }
- if (best >= 0) {
- dev_idx = best;
- goto have_slot;
- }
-no_slot:
- ret = -BCH_ERR_ENOSPC_sb_members;
- bch_err_msg(c, ret, "setting up new superblock");
- goto err_unlock;
-
-have_slot:
- nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-
- mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
- le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+ goto err_unlock;
- mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
- if (!mi) {
- ret = -BCH_ERR_ENOSPC_sb_members;
+ ret = bch2_sb_member_alloc(c);
+ if (ret < 0) {
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ unsigned dev_idx = ret;
/* success: */
- *m = dev_mi;
- m->last_mount = cpu_to_le64(ktime_get_real_seconds());
- c->disk_sb.sb->nr_devices = nr_devices;
+ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 33f2a64c14c9..03e59f86f360 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -219,7 +219,6 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
-rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -234,7 +233,7 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
- { .name = #_name, .mode = 0444 };
+ { .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
@@ -245,14 +244,18 @@ static struct attribute sysfs_state_rw = {
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
+ struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
- mutex_lock(&c->btree_cache.lock);
- list_for_each_entry(b, &c->btree_cache.live, list)
+ mutex_lock(&bc->lock);
+ list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
-
- mutex_unlock(&c->btree_cache.lock);
+ list_for_each_entry(b, &bc->live[1].list, list)
+ ret += btree_buf_bytes(b);
+ list_for_each_entry(b, &bc->freeable, list)
+ ret += btree_buf_bytes(b);
+ mutex_unlock(&bc->lock);
return ret;
}
@@ -288,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_tab_rjust(out);
prt_human_readable_u64(out, nr_extents
- ? div_u64(sectors_uncompressed << 9, nr_extents)
+ ? div64_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
@@ -347,8 +350,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
- sysfs_print(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (attr == &sysfs_journal_debug)
@@ -436,8 +437,6 @@ STORE(bch2_fs)
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
- sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
-
/* Debugging: */
if (!test_bit(BCH_FS_started, &c->flags))
@@ -449,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
+ struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
+ bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {
@@ -514,7 +514,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_cache_size,
&sysfs_btree_write_stats,
- &sysfs_promote_whole_extents,
+ &sysfs_rebalance_status,
&sysfs_compression_stats,
@@ -614,7 +614,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -674,7 +673,7 @@ STORE(bch2_fs_opts_dir)
if (ret < 0)
goto err;
- bch2_opt_set_sb(c, opt, v);
+ bch2_opt_set_sb(c, NULL, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if (v &&
@@ -728,6 +727,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+
+#define x(name) \
+ if (attr == &sysfs_time_stat_##name) \
+ bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
+ BCH_TIME_STATS()
+#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@@ -821,32 +827,17 @@ STORE(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- struct bch_member *mi;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v != BCH_MEMBER_DISCARD(mi)) {
- SET_BCH_MEMBER_DISCARD(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
}
if (attr == &sysfs_durability) {
u64 v = strtoul_or_return(buf);
- mutex_lock(&c->sb_lock);
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
-
- if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
- SET_BCH_MEMBER_DURABILITY(mi, v + 1);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
+ bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}
if (attr == &sysfs_label) {
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 01b768c9b767..b2f209743afe 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start;
- k.k_i.k.version.lo = test_version++;
+ k.k_i.k.bversion.lo = test_version++;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 0807ce9b171a..dea73bc1cb51 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -275,7 +275,6 @@ static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigne
}
static const struct file_operations thread_with_stdio_fops = {
- .llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
@@ -285,7 +284,6 @@ static const struct file_operations thread_with_stdio_fops = {
};
static const struct file_operations thread_with_stdout_fops = {
- .llseek = no_llseek,
.read = thread_with_stdio_read,
.poll = thread_with_stdout_poll,
.flush = thread_with_stdio_flush,
@@ -387,7 +385,7 @@ again:
seen = buf->buf.nr;
char *n = memchr(buf->buf.data, '\n', seen);
- if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
spin_unlock(&buf->lock);
return -ETIME;
}
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
index 4508e9dcbee2..3fe82757f93a 100644
--- a/fs/bcachefs/time_stats.c
+++ b/fs/bcachefs/time_stats.c
@@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
}
}
+void bch2_time_stats_reset(struct bch2_time_stats *stats)
+{
+ spin_lock_irq(&stats->lock);
+ unsigned offset = offsetof(struct bch2_time_stats, min_duration);
+ memset((void *) stats + offset, 0, sizeof(*stats) - offset);
+
+ if (stats->buffer) {
+ int cpu;
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(stats->buffer, cpu)->nr = 0;
+ }
+ spin_unlock_irq(&stats->lock);
+}
+
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
index 5df61403744b..dc6493f7bbab 100644
--- a/fs/bcachefs/time_stats.h
+++ b/fs/bcachefs/time_stats.h
@@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
+ struct time_stat_buffer __percpu *buffer;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@@ -87,7 +88,6 @@ struct bch2_time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
- struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
@@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
return false;
}
+void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index c62f00322d1e..5597b9d6297f 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -3,7 +3,6 @@
#define TRACE_SYSTEM bcachefs
#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_BCACHEFS_H
#include <linux/tracepoint.h>
@@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__array(char, node, 24 )
__field(u8, self_read_count )
@@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail,
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
- __entry->level = path->level;
+ __entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
@@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail,
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_intent];
- scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
}
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
@@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
+ __field(u8, path_idx)
TRACE_BPOS_entries(pos)
__field(u8, locked )
__field(u8, self_read_count )
@@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = level;
+ __entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
@@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail,
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
+ TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
+ __entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast,
TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);
+#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
+
+TRACE_EVENT(update_by_path,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite),
+ TP_ARGS(trans, path, i, overwrite),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, path_idx )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ __field(u8, overwrite )
+ __field(btree_path_idx_t, update_idx )
+ __field(btree_path_idx_t, nr_updates )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->path_idx = path - trans->paths;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ __entry->overwrite = overwrite;
+ __entry->update_idx = i - trans->updates;
+ __entry->nr_updates = trans->nr_updates;
+ ),
+
+ TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
+ __entry->trans_fn,
+ __entry->path_idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->overwrite,
+ __entry->update_idx,
+ __entry->nr_updates)
+);
+
+TRACE_EVENT(btree_path_lock,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_bkey_cached_common *b),
+ TP_ARGS(trans, caller_ip, b),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u8, level )
+ __array(char, node, 24 )
+ __field(u32, lock_seq )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = b->btree_id;
+ __entry->level = b->level;
+
+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ __entry->lock_seq = six_lock_seq(&b->lock);
+ ),
+
+ TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->level,
+ __entry->node,
+ __entry->lock_seq)
+);
+
+DECLARE_EVENT_CLASS(btree_path_ev,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(u16, idx )
+ __field(u8, ref )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
+ __entry->idx, __entry->ref,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_alloc,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
+ __entry->idx,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
+TRACE_EVENT(btree_path_get,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, locks_want )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->locks_want = path->locks_want;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->locks_want,
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot)
+);
+
+DECLARE_EVENT_CLASS(btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, new_idx )
+ __field(u8, btree_id )
+ __field(u8, ref )
+ __field(u8, preserve )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->new_idx = new - trans->paths;
+ __entry->btree_id = path->btree_id;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->new_idx)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_clone,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
+ TP_ARGS(trans, path, new)
+);
+
+DECLARE_EVENT_CLASS(btree_path_traverse,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, should_be_locked )
+ __field(u8, btree_id )
+ __field(u8, level )
+ TRACE_BPOS_entries(pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ __entry->level = path->level;
+ TRACE_BPOS_assign(pos, path->pos);
+
+ __entry->locks_want = path->locks_want;
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
+ ),
+
+ TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->trans_fn,
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->level,
+ __entry->locks_want,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
+ TP_PROTO(struct btree_trans *trans, struct btree_path *path),
+ TP_ARGS(trans, path)
+);
+
+TRACE_EVENT(btree_path_set_pos,
+ TP_PROTO(struct btree_trans *trans,
+ struct btree_path *path,
+ struct bpos *new_pos),
+ TP_ARGS(trans, path, new_pos),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, ref )
+ __field(u8, preserve )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(old_pos)
+ TRACE_BPOS_entries(new_pos)
+ __field(u8, locks_want )
+ __field(u8, nodes_locked )
+ __array(char, node0, 24 )
+ __array(char, node1, 24 )
+ __array(char, node2, 24 )
+ __array(char, node3, 24 )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path - trans->paths;
+ __entry->ref = path->ref;
+ __entry->preserve = path->preserve;
+ __entry->btree_id = path->btree_id;
+ TRACE_BPOS_assign(old_pos, path->pos);
+ TRACE_BPOS_assign(new_pos, *new_pos);
+
+ __entry->nodes_locked = path->nodes_locked;
+ struct btree *b = path->l[0].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[1].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[2].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
+ b = path->l[3].b;
+ if (IS_ERR(b))
+ strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
+ else
+ scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
+ ),
+
+ TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
+ "locks %u %u %u %u node %s %s %s %s",
+ __entry->idx,
+ __entry->ref,
+ __entry->preserve,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->old_pos_inode,
+ __entry->old_pos_offset,
+ __entry->old_pos_snapshot,
+ __entry->new_pos_inode,
+ __entry->new_pos_offset,
+ __entry->new_pos_snapshot,
+ (__entry->nodes_locked >> 6) & 3,
+ (__entry->nodes_locked >> 4) & 3,
+ (__entry->nodes_locked >> 2) & 3,
+ (__entry->nodes_locked >> 0) & 3,
+ __entry->node3,
+ __entry->node2,
+ __entry->node1,
+ __entry->node0)
+);
+
+TRACE_EVENT(btree_path_free,
+ TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
+ TP_ARGS(trans, path, dup),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ __field(u8, preserve )
+ __field(u8, should_be_locked)
+ __field(s8, dup )
+ __field(u8, dup_locked )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ __entry->preserve = trans->paths[path].preserve;
+ __entry->should_be_locked = trans->paths[path].should_be_locked;
+ __entry->dup = dup ? dup - trans->paths : -1;
+ __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
+ ),
+
+ TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
+ __entry->preserve ? 'P' : ' ',
+ __entry->should_be_locked ? 'S' : ' ',
+ __entry->dup,
+ __entry->dup_locked)
+);
+
+TRACE_EVENT(btree_path_free_trans_begin,
+ TP_PROTO(btree_path_idx_t path),
+ TP_ARGS(path),
+
+ TP_STRUCT__entry(
+ __field(btree_path_idx_t, idx )
+ ),
+
+ TP_fast_assign(
+ __entry->idx = path;
+ ),
+
+ TP_printk(" path %3u", __entry->idx)
+);
+
+#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+#ifndef _TRACE_BCACHEFS_H
+
+static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct btree_insert_entry *i, bool overwrite) {}
+static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
+static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
+static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
+static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
+static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
+static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
+
+#endif
+#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
+
+#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 1b8554460af4..42f565c76181 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res)
*res = 1;
while (p--) {
- if (*res > div_u64(U64_MAX, n))
+ if (*res > div64_u64(U64_MAX, n))
return -ERANGE;
*res *= n;
}
@@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res)
parse_or_ret(cp, parse_unit_suffix(cp, &b));
- if (v > div_u64(U64_MAX, b))
+ if (v > div64_u64(U64_MAX, b))
return -ERANGE;
v *= b;
- if (f_n > div_u64(U64_MAX, b))
+ if (f_n > div64_u64(U64_MAX, b))
return -ERANGE;
- f_n = div_u64(f_n * b, f_d);
+ f_n = div64_u64(f_n * b, f_d);
if (v + f_n < v)
return -ERANGE;
v += f_n;
@@ -204,7 +204,7 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
-u64 bch2_read_flag_list(char *opt, const char * const list[])
+u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
u64 ret = 0;
char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
@@ -214,7 +214,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
s = strim(d);
- while ((p = strsep(&s, ","))) {
+ while ((p = strsep(&s, ",;"))) {
int flag = match_string(list, -1, p);
if (flag < 0) {
@@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = bch2_pick_time_units(ns);
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+ prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name);
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
@@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
u64 q = max(quantiles->entries[i].m, last_q);
- prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+ prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 902b7f5406a2..fb02c1c36004 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
bool bch2_is_zero(const void *, size_t);
-u64 bch2_read_flag_list(char *, const char * const[]);
+u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 331f944d73dc..56c8d3fe55a4 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix,
return 0;
}
+static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
+{
+ const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
+
+ if (!xattr_handler_can_list(handler, dentry))
+ return NULL;
+
+ return xattr_prefix(handler);
+}
+
static int bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
struct xattr_buf *buf)
{
- const struct xattr_handler *handler =
- bch2_xattr_type_to_handler(xattr->x_type);
+ const char *prefix;
+
+ prefix = bch2_xattr_prefix(xattr->x_type, dentry);
+ if (!prefix)
+ return 0;
- return handler && (!handler->list || handler->list(dentry))
- ? __bch2_xattr_emit(handler->prefix ?: handler->name,
- xattr->x_name, xattr->x_name_len, buf)
- : 0;
+ return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@@ -295,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
- u32 snapshot;
- int ret;
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
- SPOS(inum, offset, snapshot),
- POS(inum, U64_MAX), 0, k, ret) {
- if (k.k->type != KEY_TYPE_xattr)
- continue;
-
- ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
- if (ret)
- break;
- }
- offset = iter.pos.offset;
- bch2_trans_iter_exit(trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- bch2_trans_put(trans);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+ if (k.k->type != KEY_TYPE_xattr)
+ continue;
- if (ret)
- goto out;
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+ }))) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
- if (ret)
- goto out;
-
- ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
- if (ret)
- goto out;
-
- return buf.used;
-out:
- return bch2_err_class(ret);
+ return ret ? bch2_err_class(ret) : buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@@ -632,10 +611,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
- &nop_posix_acl_access,
- &nop_posix_acl_default,
-#endif
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
index e9f810539552..c7916011ef34 100644
--- a/fs/bcachefs/xattr_format.h
+++ b/fs/bcachefs/xattr_format.h
@@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
- __u8 x_name[];
+ __u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 34d0d1e43f36..06dc4a57ba78 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2032,10 +2032,8 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm)) {
- coredump_report_failure("Error collecting note info");
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
- }
has_dumped = 1;
@@ -2050,10 +2048,8 @@ static int elf_core_dump(struct coredump_params *cprm)
sz += elf_coredump_extra_notes_size();
phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
- if (!phdr4note) {
- coredump_report_failure("Error allocating program headers note entry");
+ if (!phdr4note)
goto end_coredump;
- }
fill_elf_note_phdr(phdr4note, sz, offset);
offset += sz;
@@ -2067,24 +2063,18 @@ static int elf_core_dump(struct coredump_params *cprm)
if (e_phnum == PN_XNUM) {
shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
- if (!shdr4extnum) {
- coredump_report_failure("Error allocating extra program headers");
+ if (!shdr4extnum)
goto end_coredump;
- }
fill_extnum_info(&elf, shdr4extnum, e_shoff, segs);
}
offset = dataoff;
- if (!dump_emit(cprm, &elf, sizeof(elf))) {
- coredump_report_failure("Error emitting the ELF headers");
+ if (!dump_emit(cprm, &elf, sizeof(elf)))
goto end_coredump;
- }
- if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) {
- coredump_report_failure("Error emitting the program header for notes");
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
goto end_coredump;
- }
/* Write program headers for segments dump */
for (i = 0; i < cprm->vma_count; i++) {
@@ -2107,28 +2097,20 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- if (!dump_emit(cprm, &phdr, sizeof(phdr))) {
- coredump_report_failure("Error emitting program headers");
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
goto end_coredump;
- }
}
- if (!elf_core_write_extra_phdrs(cprm, offset)) {
- coredump_report_failure("Error writing out extra program headers");
+ if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- }
/* write out the notes section */
- if (!write_note_info(&info, cprm)) {
- coredump_report_failure("Error writing out notes");
+ if (!write_note_info(&info, cprm))
goto end_coredump;
- }
/* For cell spufs and x86 xstate */
- if (elf_coredump_extra_notes_write(cprm)) {
- coredump_report_failure("Error writing out extra notes");
+ if (elf_coredump_extra_notes_write(cprm))
goto end_coredump;
- }
/* Align to page */
dump_skip_to(cprm, dataoff);
@@ -2136,22 +2118,16 @@ static int elf_core_dump(struct coredump_params *cprm)
for (i = 0; i < cprm->vma_count; i++) {
struct core_vma_metadata *meta = cprm->vma_meta + i;
- if (!dump_user_range(cprm, meta->start, meta->dump_size)) {
- coredump_report_failure("Error writing out the process memory");
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
- }
}
- if (!elf_core_write_extra_data(cprm)) {
- coredump_report_failure("Error writing out extra data");
+ if (!elf_core_write_extra_data(cprm))
goto end_coredump;
- }
if (e_phnum == PN_XNUM) {
- if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum))) {
- coredump_report_failure("Error emitting extra program headers");
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
goto end_coredump;
- }
}
end_coredump:
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
new file mode 100644
index 000000000000..3fe9f59ef867
--- /dev/null
+++ b/fs/bpf_fs_kfuncs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Google LLC. */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_task_exe_file - get a reference on the exe_file struct file member of
+ * the mm_struct that is nested within the supplied
+ * task_struct
+ * @task: task_struct of which the nested mm_struct exe_file member to get a
+ * reference on
+ *
+ * Get a reference on the exe_file struct file member field of the mm_struct
+ * nested within the supplied *task*. The referenced file pointer acquired by
+ * this BPF kfunc must be released using bpf_put_file(). Failing to call
+ * bpf_put_file() on the returned referenced struct file pointer that has been
+ * acquired by this BPF kfunc will result in the BPF program being rejected by
+ * the BPF verifier.
+ *
+ * This BPF kfunc may only be called from BPF LSM programs.
+ *
+ * Internally, this BPF kfunc leans on get_task_exe_file(), such that calling
+ * bpf_get_task_exe_file() would be analogous to calling get_task_exe_file()
+ * directly in kernel context.
+ *
+ * Return: A referenced struct file pointer to the exe_file member of the
+ * mm_struct that is nested within the supplied *task*. On error, NULL is
+ * returned.
+ */
+__bpf_kfunc struct file *bpf_get_task_exe_file(struct task_struct *task)
+{
+ return get_task_exe_file(task);
+}
+
+/**
+ * bpf_put_file - put a reference on the supplied file
+ * @file: file to put a reference on
+ *
+ * Put a reference on the supplied *file*. Only referenced file pointers may be
+ * passed to this BPF kfunc. Attempting to pass an unreferenced file pointer, or
+ * any other arbitrary pointer for that matter, will result in the BPF program
+ * being rejected by the BPF verifier.
+ *
+ * This BPF kfunc may only be called from BPF LSM programs.
+ */
+__bpf_kfunc void bpf_put_file(struct file *file)
+{
+ fput(file);
+}
+
+/**
+ * bpf_path_d_path - resolve the pathname for the supplied path
+ * @path: path to resolve the pathname for
+ * @buf: buffer to return the resolved pathname in
+ * @buf__sz: length of the supplied buffer
+ *
+ * Resolve the pathname for the supplied *path* and store it in *buf*. This BPF
+ * kfunc is the safer variant of the legacy bpf_d_path() helper and should be
+ * used in place of bpf_d_path() whenever possible. It enforces KF_TRUSTED_ARGS
+ * semantics, meaning that the supplied *path* must itself hold a valid
+ * reference, or else the BPF program will be outright rejected by the BPF
+ * verifier.
+ *
+ * This BPF kfunc may only be called from BPF LSM programs.
+ *
+ * Return: A positive integer corresponding to the length of the resolved
+ * pathname in *buf*, including the NUL termination character. On error, a
+ * negative integer is returned.
+ */
+__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
+{
+ int len;
+ char *ret;
+
+ if (!buf__sz)
+ return -EINVAL;
+
+ ret = d_path(path, buf, buf__sz);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ len = buf + buf__sz - ret;
+ memmove(buf, ret, len);
+ return len;
+}
+
+/**
+ * bpf_get_dentry_xattr - get xattr of a dentry
+ * @dentry: dentry to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *dentry* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ struct inode *inode = d_inode(dentry);
+ u32 value_len;
+ void *value;
+ int ret;
+
+ if (WARN_ON(!inode))
+ return -EINVAL;
+
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+ if (ret)
+ return ret;
+ return __vfs_getxattr(dentry, inode, name__str, value, value_len);
+}
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct dentry *dentry;
+
+ dentry = file_dentry(file);
+ return bpf_get_dentry_xattr(dentry, name__str, value_p);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_task_exe_file,
+ KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
+
+static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&bpf_fs_kfunc_set_ids, kfunc_id) ||
+ prog->type == BPF_PROG_TYPE_LSM)
+ return 0;
+ return -EACCES;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_fs_kfunc_set_ids,
+ .filter = bpf_fs_kfuncs_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9a4b7c119318..e152fde888fc 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -152,6 +152,7 @@ struct btrfs_inode {
* logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
* defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
* update the VFS' inode number of bytes used.
+ * Also protects setting struct file::private_data.
*/
spinlock_t lock;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a44fb9845e3..317a3712270f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -463,6 +463,8 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
+ /* Task that allocated this structure. */
+ struct task_struct *owner_task;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index acf1f39e45d0..b95ef44c326b 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -213,6 +213,8 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
&fs_info->defrag_inodes, rb_node)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ fs_info->defrag_inodes = RB_ROOT;
+
spin_unlock(&fs_info->defrag_inodes_lock);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c5e36f58eb07..4fb521d91b06 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3485,7 +3485,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
- struct btrfs_file_private *private = file->private_data;
+ struct btrfs_file_private *private;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *cached_state = NULL;
struct extent_state **delalloc_cached_state;
@@ -3513,7 +3513,19 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
inode_get_bytes(&inode->vfs_inode) == i_size)
return i_size;
- if (!private) {
+ spin_lock(&inode->lock);
+ private = file->private_data;
+ spin_unlock(&inode->lock);
+
+ if (private && private->owner_task != current) {
+ /*
+ * Not allocated by us, don't use it as its cached state is used
+ * by the task that allocated it and we don't want neither to
+ * mess with it nor get incorrect results because it reflects an
+ * invalid state for the current task.
+ */
+ private = NULL;
+ } else if (!private) {
private = kzalloc(sizeof(*private), GFP_KERNEL);
/*
* No worries if memory allocation failed.
@@ -3521,7 +3533,23 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
* lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
* so everything will still be correct.
*/
- file->private_data = private;
+ if (private) {
+ bool free = false;
+
+ private->owner_task = current;
+
+ spin_lock(&inode->lock);
+ if (file->private_data)
+ free = true;
+ else
+ file->private_data = private;
+ spin_unlock(&inode->lock);
+
+ if (free) {
+ kfree(private);
+ private = NULL;
+ }
+ }
}
if (private)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8537eb9b5531..226c91fe31a7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1310,12 +1310,12 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
- if (!src.file) {
+ if (!fd_file(src)) {
ret = -EINVAL;
goto out_drop_write;
}
- src_inode = file_inode(src.file);
+ src_inode = file_inode(fd_file(src));
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 634d69964fe4..7b50263723bc 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1517,7 +1517,7 @@ static int check_extent_item(struct extent_buffer *leaf,
dref_objectid > BTRFS_LAST_FREE_OBJECTID)) {
extent_err(leaf, slot,
"invalid data ref objectid value %llu",
- dref_root);
+ dref_objectid);
return -EUCLEAN;
}
if (unlikely(!IS_ALIGNED(dref_offset,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f53977169db4..2b3f9935dbb4 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -595,14 +595,12 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
* write and readdir but not lookup or open).
*/
touch_atime(&file->f_path);
- dput(dentry);
return true;
check_failed:
fscache_cookie_lookup_negative(object->cookie);
cachefiles_unmark_inode_in_use(object, file);
fput(file);
- dput(dentry);
if (ret == -ESTALE)
return cachefiles_create_file(object);
return false;
@@ -611,7 +609,6 @@ error_fput:
fput(file);
error:
cachefiles_do_unmark_inode_in_use(object, d_inode(dentry));
- dput(dentry);
return false;
}
@@ -654,7 +651,9 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
goto new_file;
}
- if (!cachefiles_open_file(object, dentry))
+ ret = cachefiles_open_file(object, dentry);
+ dput(dentry);
+ if (!ret)
return false;
_leave(" = t [%lu]", file_inode(object->file)->i_ino);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5d9ccda098cc..53fef258c2bc 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -96,7 +96,6 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
- BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 808c9c048276..bed34fc11c91 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/iversion.h>
#include <linux/filelock.h>
+#include <linux/jiffies.h>
#include "super.h"
#include "mds_client.h"
@@ -4149,7 +4150,7 @@ retry:
ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
- /* add placeholder for the export tagert */
+ /* add placeholder for the export target */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
@@ -4602,7 +4603,7 @@ flush_cap_releases:
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
goto done;
bad:
@@ -4659,7 +4660,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
* slowness doesn't block mdsc delayed work,
* preventing send_renew_caps() from running.
*/
- if (jiffies - loop_start >= 5 * HZ)
+ if (time_after_eq(jiffies, loop_start + 5 * HZ))
break;
}
spin_unlock(&mdsc->cap_delay_lock);
@@ -4701,6 +4702,28 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
+/*
+ * Flush all cap releases to the mds
+ */
+static void flush_cap_releases(struct ceph_mds_session *s)
+{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&s->s_cap_lock);
+ if (s->s_num_cap_releases)
+ ceph_flush_session_cap_releases(mdsc, s);
+ spin_unlock(&s->s_cap_lock);
+ doutc(cl, "done\n");
+
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true);
+}
+
void __ceph_touch_fmode(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc, int fmode)
{
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ddec8c9244ee..952109292d69 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2058,7 +2058,7 @@ static int ceph_d_delete(const struct dentry *dentry)
return 0;
if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
return 0;
- /* vaild lease? */
+ /* valid lease? */
di = ceph_dentry(dentry);
if (di) {
if (__dentry_lease_is_valid(di))
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4a8eec46254b..315ef02f9a3f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1779,7 +1779,7 @@ retry_lookup:
if (err < 0)
goto done;
} else if (rinfo->head->is_dentry && req->r_dentry) {
- /* parent inode is not locked, be carefull */
+ /* parent inode is not locked, be careful */
struct ceph_vino *ptvino = NULL;
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 276e34ab3e2c..c4a5fd94bbbb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2266,7 +2266,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
trim_caps - remaining);
}
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
return 0;
}
@@ -2420,7 +2420,7 @@ static void ceph_cap_release_work(struct work_struct *work)
ceph_put_mds_session(session);
}
-void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_client *cl = mdsc->fsc->client;
@@ -2447,7 +2447,7 @@ void __ceph_queue_cap_release(struct ceph_mds_session *session,
session->s_num_cap_releases++;
if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
- ceph_flush_cap_releases(session->s_mdsc, session);
+ ceph_flush_session_cap_releases(session->s_mdsc, session);
}
static void ceph_cap_reclaim_work(struct work_struct *work)
@@ -4340,7 +4340,7 @@ skip_cap_auths:
/* flush cap releases */
spin_lock(&session->s_cap_lock);
if (session->s_num_cap_releases)
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
spin_unlock(&session->s_cap_lock);
send_flushmsg_ack(mdsc, session, seq);
@@ -4910,7 +4910,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
} else {
recon_state.msg_version = 2;
}
- /* trsaverse this session's caps */
+ /* traverse this session's caps */
err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
@@ -5446,7 +5446,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_flush_cap_releases(mdsc, s);
+ ceph_flush_session_cap_releases(mdsc, s);
mutex_lock(&s->s_mutex);
if (renew_caps)
@@ -5877,6 +5877,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_flush_dirty_caps(mdsc);
+ ceph_flush_cap_releases(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
if (!list_empty(&mdsc->cap_flush_list)) {
@@ -6015,6 +6016,18 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
ceph_caps_finalize(mdsc);
+
+ if (mdsc->s_cap_auths) {
+ int i;
+
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ kfree(mdsc->s_cap_auths[i].match.gids);
+ kfree(mdsc->s_cap_auths[i].match.path);
+ kfree(mdsc->s_cap_auths[i].match.fs_name);
+ }
+ kfree(mdsc->s_cap_auths);
+ }
+
ceph_pool_perm_destroy(mdsc);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 9bcc7f181bfe..3dd54587944a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -559,9 +559,6 @@ extern struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
@@ -602,8 +599,8 @@ extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
-extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
+extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0cdf84cd1791..73f321b52895 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -126,6 +126,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
if (!wait) {
doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
+ ceph_flush_cap_releases(fsc->mdsc);
doutc(cl, "(non-blocking) done\n");
return 0;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6e817bf1337c..2508aa8950b7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1056,8 +1056,6 @@ extern int ceph_fill_trace(struct super_block *sb,
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
@@ -1208,10 +1206,6 @@ static inline void ceph_init_inode_acls(struct inode *inode,
struct ceph_acl_sec_ctx *as_ctx)
{
}
-static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
-{
- return 0;
-}
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
@@ -1270,6 +1264,7 @@ extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
diff --git a/fs/coredump.c b/fs/coredump.c
index 53a78b6bbb5b..45737b43dda5 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -465,17 +465,7 @@ static bool dump_interrupted(void)
* but then we need to teach dump_write() to restart and clear
* TIF_SIGPENDING.
*/
- if (fatal_signal_pending(current)) {
- coredump_report_failure("interrupted: fatal signal pending");
- return true;
- }
-
- if (freezing(current)) {
- coredump_report_failure("interrupted: freezing");
- return true;
- }
-
- return false;
+ return fatal_signal_pending(current) || freezing(current);
}
static void wait_for_dump_helpers(struct file *file)
@@ -530,7 +520,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-int do_coredump(const kernel_siginfo_t *siginfo)
+void do_coredump(const kernel_siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
@@ -538,7 +528,7 @@ int do_coredump(const kernel_siginfo_t *siginfo)
struct linux_binfmt * binfmt;
const struct cred *old_cred;
struct cred *cred;
- int retval;
+ int retval = 0;
int ispipe;
size_t *argv = NULL;
int argc = 0;
@@ -562,20 +552,14 @@ int do_coredump(const kernel_siginfo_t *siginfo)
audit_core_dumps(siginfo->si_signo);
binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump) {
- retval = -ENOEXEC;
+ if (!binfmt || !binfmt->core_dump)
goto fail;
- }
- if (!__get_dumpable(cprm.mm_flags)) {
- retval = -EACCES;
+ if (!__get_dumpable(cprm.mm_flags))
goto fail;
- }
cred = prepare_creds();
- if (!cred) {
- retval = -EPERM;
+ if (!cred)
goto fail;
- }
/*
* We cannot trust fsuid as being the "true" uid of the process
* nor do we know its entire history. We only know it was tainted
@@ -604,7 +588,6 @@ int do_coredump(const kernel_siginfo_t *siginfo)
if (ispipe < 0) {
coredump_report_failure("format_corename failed, aborting core");
- retval = ispipe;
goto fail_unlock;
}
@@ -625,7 +608,6 @@ int do_coredump(const kernel_siginfo_t *siginfo)
* core_pattern process dies.
*/
coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
- retval = -EPERM;
goto fail_unlock;
}
cprm.limit = RLIM_INFINITY;
@@ -633,7 +615,6 @@ int do_coredump(const kernel_siginfo_t *siginfo)
dump_count = atomic_inc_return(&core_dump_count);
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
coredump_report_failure("over core_pipe_limit, skipping core dump");
- retval = -E2BIG;
goto fail_dropcount;
}
@@ -641,7 +622,6 @@ int do_coredump(const kernel_siginfo_t *siginfo)
GFP_KERNEL);
if (!helper_argv) {
coredump_report_failure("%s failed to allocate memory", __func__);
- retval = -ENOMEM;
goto fail_dropcount;
}
for (argi = 0; argi < argc; argi++)
@@ -667,16 +647,12 @@ int do_coredump(const kernel_siginfo_t *siginfo)
int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
O_LARGEFILE | O_EXCL;
- if (cprm.limit < binfmt->min_coredump) {
- coredump_report_failure("over coredump resource limit, skipping core dump");
- retval = -E2BIG;
+ if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
- }
if (need_suid_safe && cn.corename[0] != '/') {
coredump_report_failure(
"this process can only dump core to a fully qualified path, skipping core dump");
- retval = -EPERM;
goto fail_unlock;
}
@@ -722,28 +698,20 @@ int do_coredump(const kernel_siginfo_t *siginfo)
} else {
cprm.file = filp_open(cn.corename, open_flags, 0600);
}
- if (IS_ERR(cprm.file)) {
- retval = PTR_ERR(cprm.file);
+ if (IS_ERR(cprm.file))
goto fail_unlock;
- }
inode = file_inode(cprm.file);
- if (inode->i_nlink > 1) {
- retval = -EMLINK;
+ if (inode->i_nlink > 1)
goto close_fail;
- }
- if (d_unhashed(cprm.file->f_path.dentry)) {
- retval = -EEXIST;
+ if (d_unhashed(cprm.file->f_path.dentry))
goto close_fail;
- }
/*
* AK: actually i see no reason to not allow this for named
* pipes etc, but keep the previous behaviour for now.
*/
- if (!S_ISREG(inode->i_mode)) {
- retval = -EISDIR;
+ if (!S_ISREG(inode->i_mode))
goto close_fail;
- }
/*
* Don't dump core if the filesystem changed owner or mode
* of the file during file creation. This is an issue when
@@ -755,22 +723,17 @@ int do_coredump(const kernel_siginfo_t *siginfo)
current_fsuid())) {
coredump_report_failure("Core dump to %s aborted: "
"cannot preserve file owner", cn.corename);
- retval = -EPERM;
goto close_fail;
}
if ((inode->i_mode & 0677) != 0600) {
coredump_report_failure("Core dump to %s aborted: "
"cannot preserve file permissions", cn.corename);
- retval = -EPERM;
goto close_fail;
}
- if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) {
- retval = -EACCES;
+ if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
- }
- retval = do_truncate(idmap, cprm.file->f_path.dentry,
- 0, 0, cprm.file);
- if (retval)
+ if (do_truncate(idmap, cprm.file->f_path.dentry,
+ 0, 0, cprm.file))
goto close_fail;
}
@@ -786,15 +749,10 @@ int do_coredump(const kernel_siginfo_t *siginfo)
*/
if (!cprm.file) {
coredump_report_failure("Core dump to |%s disabled", cn.corename);
- retval = -EPERM;
goto close_fail;
}
- if (!dump_vma_snapshot(&cprm)) {
- coredump_report_failure("Can't get VMA snapshot for core dump |%s",
- cn.corename);
- retval = -EACCES;
+ if (!dump_vma_snapshot(&cprm))
goto close_fail;
- }
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
@@ -810,21 +768,9 @@ int do_coredump(const kernel_siginfo_t *siginfo)
}
file_end_write(cprm.file);
free_vma_snapshot(&cprm);
- } else {
- coredump_report_failure("Core dump to %s%s has been interrupted",
- ispipe ? "|" : "", cn.corename);
- retval = -EAGAIN;
- goto fail;
}
- coredump_report(
- "written to %s%s: VMAs: %d, size %zu; core: %lld bytes, pos %lld",
- ispipe ? "|" : "", cn.corename,
- cprm.vma_count, cprm.vma_data_size, cprm.written, cprm.pos);
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
-
- retval = 0;
-
close_fail:
if (cprm.file)
filp_close(cprm.file, NULL);
@@ -839,7 +785,7 @@ fail_unlock:
fail_creds:
put_cred(cred);
fail:
- return retval;
+ return;
}
/*
@@ -859,16 +805,8 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
if (dump_interrupted())
return 0;
n = __kernel_write(file, addr, nr, &pos);
- if (n != nr) {
- if (n < 0)
- coredump_report_failure("failed when writing out, error %zd", n);
- else
- coredump_report_failure(
- "partially written out, only %zd(of %d) bytes written",
- n, nr);
-
+ if (n != nr)
return 0;
- }
file->f_pos = pos;
cprm->written += n;
cprm->pos += n;
@@ -881,16 +819,9 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
if (file->f_mode & FMODE_LSEEK) {
- int ret;
-
- if (dump_interrupted())
+ if (dump_interrupted() ||
+ vfs_llseek(file, nr, SEEK_CUR) < 0)
return 0;
-
- ret = vfs_llseek(file, nr, SEEK_CUR);
- if (ret < 0) {
- coredump_report_failure("failed when seeking, error %d", ret);
- return 0;
- }
cprm->pos += nr;
return 1;
} else {
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c6f4a9a98b85..67299e8b734e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1218,7 +1218,6 @@ static const struct file_operations u32_array_fops = {
.open = u32_array_open,
.release = u32_array_release,
.read = u32_array_read,
- .llseek = no_llseek,
};
/**
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 7112958c2e5b..700a0cbb2f14 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -733,7 +733,6 @@ out:
static const struct file_operations dlm_rawmsg_fops = {
.open = simple_open,
.write = dlm_rawmsg_write,
- .llseek = no_llseek,
};
void *dlm_create_debug_comms_file(int nodeid, void *data)
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 7e9961639802..23c51d62f902 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -110,5 +110,4 @@ const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
- .llseek = no_llseek,
};
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 9afdb722fa92..22c934f3a080 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -349,9 +349,9 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
struct eventfd_ctx *ctx;
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- ctx = eventfd_ctx_fileget(f.file);
+ ctx = eventfd_ctx_fileget(fd_file(f));
fdput(f);
return ctx;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 145f5349c612..1ae4542f0bd8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2261,17 +2261,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error = -EBADF;
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
- if (!tf.file)
+ if (!fd_file(tf))
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!file_can_poll(tf.file))
+ if (!file_can_poll(fd_file(tf)))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
@@ -2284,7 +2284,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
goto error_tgt_fput;
/*
@@ -2295,7 +2295,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@@ -2304,7 +2304,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
@@ -2325,16 +2325,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(fd_file(tf))) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
+ if (is_file_epoll(fd_file(tf))) {
+ tep = fd_file(tf)->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
@@ -2350,14 +2350,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tf.file, fd);
+ epi = ep_find(ep, fd_file(tf), fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, tf.file, fd, full_check);
+ error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
} else
error = -EEXIST;
break;
@@ -2438,7 +2438,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/*
@@ -2446,14 +2446,14 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
- if (!is_file_epoll(f.file))
+ if (!is_file_epoll(fd_file(f)))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);
diff --git a/fs/exec.c b/fs/exec.c
index dad402d55681..6c53920795c2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -710,80 +710,6 @@ static int copy_strings_kernel(int argc, const char *const *argv,
#ifdef CONFIG_MMU
/*
- * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
- * the binfmt code determines where the new stack should reside, we shift it to
- * its final location. The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- if (length != move_page_tables(vma, old_start,
- vma, new_start, length, false, true))
- return -ENOMEM;
-
- lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
-/*
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
* the stack is optionally relocated, and some extra space is added.
*/
@@ -876,7 +802,12 @@ int setup_arg_pages(struct linux_binprm *bprm,
/* Move stack pages down in memory. */
if (stack_shift) {
- ret = shift_arg_pages(vma, stack_shift);
+ /*
+ * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
+ * the binfmt code determines where the new stack should reside, we shift it to
+ * its final location.
+ */
+ ret = relocate_vma_down(vma, stack_shift);
if (ret)
goto out_unlock;
}
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 0356c88252bd..ce9be95c9172 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -91,11 +91,8 @@ int exfat_load_bitmap(struct super_block *sb)
return -EIO;
type = exfat_get_entry_type(ep);
- if (type == TYPE_UNUSED)
- break;
- if (type != TYPE_BITMAP)
- continue;
- if (ep->dentry.bitmap.flags == 0x0) {
+ if (type == TYPE_BITMAP &&
+ ep->dentry.bitmap.flags == 0x0) {
int err;
err = exfat_allocate_bitmap(sb, ep);
@@ -103,6 +100,9 @@ int exfat_load_bitmap(struct super_block *sb)
return err;
}
brelse(bh);
+
+ if (type == TYPE_UNUSED)
+ return -EINVAL;
}
if (exfat_get_next_cluster(sb, &clu.dir))
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index ecc5db952deb..3cdc1de362a9 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -10,6 +10,7 @@
#include <linux/ratelimit.h>
#include <linux/nls.h>
#include <linux/blkdev.h>
+#include <uapi/linux/exfat.h>
#define EXFAT_ROOT_INO 1
@@ -148,6 +149,9 @@ enum {
#define DIR_CACHE_SIZE \
(DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1)
+/* Superblock flags */
+#define EXFAT_FLAGS_SHUTDOWN 1
+
struct exfat_dentry_namebuf {
char *lfn;
int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
@@ -267,6 +271,8 @@ struct exfat_sb_info {
unsigned int clu_srch_ptr; /* cluster search pointer */
unsigned int used_clusters; /* number of used clusters */
+ unsigned long s_exfat_flags; /* Exfat superblock flags */
+
struct mutex s_lock; /* superblock lock */
struct mutex bitmap_lock; /* bitmap lock */
struct exfat_mount_options options;
@@ -309,13 +315,6 @@ struct exfat_inode_info {
/* for avoiding the race between alloc and free */
unsigned int cache_valid_id;
- /*
- * NOTE: i_size_ondisk is 64bits, so must hold ->inode_lock to access.
- * physically allocated size.
- */
- loff_t i_size_ondisk;
- /* block-aligned i_size (used in cont_write_begin) */
- loff_t i_size_aligned;
/* on-disk position of directory entry or 0 */
loff_t i_pos;
loff_t valid_size;
@@ -338,6 +337,11 @@ static inline struct exfat_inode_info *EXFAT_I(struct inode *inode)
return container_of(inode, struct exfat_inode_info, vfs_inode);
}
+static inline int exfat_forced_shutdown(struct super_block *sb)
+{
+ return test_bit(EXFAT_FLAGS_SHUTDOWN, &EXFAT_SB(sb)->s_exfat_flags);
+}
+
/*
* If ->i_mode can't hold 0222 (i.e. ATTR_RO), we use ->i_attrs to
* save ATTR_RO instead of ->i_mode.
@@ -417,6 +421,11 @@ static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters;
}
+static inline loff_t exfat_ondisk_size(const struct inode *inode)
+{
+ return ((loff_t)inode->i_blocks) << 9;
+}
+
/* super.c */
int exfat_set_volume_dirty(struct super_block *sb);
int exfat_clear_volume_dirty(struct super_block *sb);
@@ -461,6 +470,7 @@ int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long exfat_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
+int exfat_force_shutdown(struct super_block *sb, u32 flags);
/* namei.c */
extern const struct dentry_operations exfat_dentry_ops;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index e19469e88000..a25d7eb789f4 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -29,7 +29,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (ret)
return ret;
- num_clusters = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+ num_clusters = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi);
new_num_clusters = EXFAT_B_TO_CLU_ROUND_UP(size, sbi);
if (new_num_clusters == num_clusters)
@@ -74,8 +74,6 @@ out:
/* Expanded range not zeroed, do not update valid_size */
i_size_write(inode, size);
- ei->i_size_aligned = round_up(size, sb->s_blocksize);
- ei->i_size_ondisk = ei->i_size_aligned;
inode->i_blocks = round_up(size, sbi->cluster_size) >> 9;
mark_inode_dirty(inode);
@@ -159,7 +157,7 @@ int __exfat_truncate(struct inode *inode)
exfat_set_volume_dirty(sb);
num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi);
- num_clusters_phys = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+ num_clusters_phys = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi);
exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
@@ -245,8 +243,6 @@ void exfat_truncate(struct inode *inode)
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- unsigned int blocksize = i_blocksize(inode);
- loff_t aligned_size;
int err;
mutex_lock(&sbi->s_lock);
@@ -264,17 +260,6 @@ void exfat_truncate(struct inode *inode)
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
write_size:
- aligned_size = i_size_read(inode);
- if (aligned_size & (blocksize - 1)) {
- aligned_size |= (blocksize - 1);
- aligned_size++;
- }
-
- if (ei->i_size_ondisk > i_size_read(inode))
- ei->i_size_ondisk = aligned_size;
-
- if (ei->i_size_aligned > i_size_read(inode))
- ei->i_size_aligned = aligned_size;
mutex_unlock(&sbi->s_lock);
}
@@ -302,6 +287,9 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
unsigned int ia_valid;
int error;
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size > i_size_read(inode)) {
error = exfat_cont_expand(inode, attr->ia_size);
@@ -485,6 +473,19 @@ static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg)
return 0;
}
+static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg)
+{
+ u32 flags;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(flags, (__u32 __user *)arg))
+ return -EFAULT;
+
+ return exfat_force_shutdown(sb, flags);
+}
+
long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -495,6 +496,8 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return exfat_ioctl_get_attributes(inode, user_attr);
case FAT_IOCTL_SET_ATTRIBUTES:
return exfat_ioctl_set_attributes(filp, user_attr);
+ case EXFAT_IOC_SHUTDOWN:
+ return exfat_ioctl_shutdown(inode->i_sb, arg);
case FITRIM:
return exfat_ioctl_fitrim(inode, arg);
default:
@@ -515,6 +518,9 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct inode *inode = filp->f_mapping->host;
int err;
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
err = __generic_file_fsync(filp, start, end, datasync);
if (err)
return err;
@@ -526,32 +532,32 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
-static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end)
+static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
{
int err;
+ loff_t pos;
struct inode *inode = file_inode(file);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *ops = mapping->a_ops;
- while (start < end) {
- u32 zerofrom, len;
+ pos = ei->valid_size;
+ while (pos < new_valid_size) {
+ u32 len;
struct folio *folio;
- zerofrom = start & (PAGE_SIZE - 1);
- len = PAGE_SIZE - zerofrom;
- if (start + len > end)
- len = end - start;
+ len = PAGE_SIZE - (pos & (PAGE_SIZE - 1));
+ if (pos + len > new_valid_size)
+ len = new_valid_size - pos;
- err = ops->write_begin(file, mapping, start, len, &folio, NULL);
+ err = ops->write_begin(file, mapping, pos, len, &folio, NULL);
if (err)
goto out;
- folio_zero_range(folio, offset_in_folio(folio, start), len);
-
- err = ops->write_end(file, mapping, start, len, len, folio, NULL);
+ err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
- start += len;
+ pos += len;
balance_dirty_pages_ratelimited(mapping);
cond_resched();
@@ -579,7 +585,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
goto unlock;
if (pos > valid_size) {
- ret = exfat_file_zeroed_range(file, valid_size, pos);
+ ret = exfat_extend_valid_size(file, pos);
if (ret < 0 && ret != -ENOSPC) {
exfat_err(inode->i_sb,
"write: fail to zero from %llu to %llu(%zd)",
@@ -613,26 +619,46 @@ unlock:
return ret;
}
-static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
{
- int ret;
+ int err;
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct exfat_inode_info *ei = EXFAT_I(inode);
- loff_t start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- loff_t end = min_t(loff_t, i_size_read(inode),
+ loff_t start, end;
+
+ if (!inode_trylock(inode))
+ return VM_FAULT_RETRY;
+
+ start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ end = min_t(loff_t, i_size_read(inode),
start + vma->vm_end - vma->vm_start);
- if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) {
- ret = exfat_file_zeroed_range(file, ei->valid_size, end);
- if (ret < 0) {
- exfat_err(inode->i_sb,
- "mmap: fail to zero from %llu to %llu(%d)",
- start, end, ret);
- return ret;
+ if (ei->valid_size < end) {
+ err = exfat_extend_valid_size(file, end);
+ if (err < 0) {
+ inode_unlock(inode);
+ return vmf_fs_error(err);
}
}
- return generic_file_mmap(file, vma);
+ inode_unlock(inode);
+
+ return filemap_page_mkwrite(vmf);
+}
+
+static const struct vm_operations_struct exfat_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = exfat_page_mkwrite,
+};
+
+static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &exfat_file_vm_ops;
+ return 0;
}
const struct file_operations exfat_file_operations = {
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 05f0e07b01d0..d724de8f57bf 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -102,6 +102,9 @@ int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret;
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
@@ -130,11 +133,9 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
unsigned int local_clu_offset = clu_offset;
- unsigned int num_to_be_allocated = 0, num_clusters = 0;
+ unsigned int num_to_be_allocated = 0, num_clusters;
- if (ei->i_size_ondisk > 0)
- num_clusters =
- EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+ num_clusters = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi);
if (clu_offset >= num_clusters)
num_to_be_allocated = clu_offset - num_clusters + 1;
@@ -260,21 +261,6 @@ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset,
return 0;
}
-static int exfat_map_new_buffer(struct exfat_inode_info *ei,
- struct buffer_head *bh, loff_t pos)
-{
- if (buffer_delay(bh) && pos > ei->i_size_aligned)
- return -EIO;
- set_buffer_new(bh);
-
- /*
- * Adjust i_size_aligned if i_size_ondisk is bigger than it.
- */
- if (ei->i_size_ondisk > ei->i_size_aligned)
- ei->i_size_aligned = ei->i_size_ondisk;
- return 0;
-}
-
static int exfat_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -288,7 +274,6 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
sector_t last_block;
sector_t phys = 0;
sector_t valid_blks;
- loff_t pos;
mutex_lock(&sbi->s_lock);
last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb);
@@ -316,12 +301,6 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
mapped_blocks = sbi->sect_per_clus - sec_offset;
max_blocks = min(mapped_blocks, max_blocks);
- pos = EXFAT_BLK_TO_B((iblock + 1), sb);
- if ((create && iblock >= last_block) || buffer_delay(bh_result)) {
- if (ei->i_size_ondisk < pos)
- ei->i_size_ondisk = pos;
- }
-
map_bh(bh_result, sb, phys);
if (buffer_delay(bh_result))
clear_buffer_delay(bh_result);
@@ -342,13 +321,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
}
/* The area has not been written, map and mark as new. */
- err = exfat_map_new_buffer(ei, bh_result, pos);
- if (err) {
- exfat_fs_error(sb,
- "requested for bmap out of range(pos : (%llu) > i_size_aligned(%llu)\n",
- pos, ei->i_size_aligned);
- goto unlock_ret;
- }
+ set_buffer_new(bh_result);
ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb);
mark_inode_dirty(inode);
@@ -371,7 +344,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
* The block has been partially written,
* zero the unwritten part and map the block.
*/
- loff_t size, off;
+ loff_t size, off, pos;
max_blocks = 1;
@@ -382,7 +355,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (!bh_result->b_folio)
goto done;
- pos -= sb->s_blocksize;
+ pos = EXFAT_BLK_TO_B(iblock, sb);
size = ei->valid_size - pos;
off = pos & (PAGE_SIZE - 1);
@@ -432,6 +405,9 @@ static void exfat_readahead(struct readahead_control *rac)
static int exfat_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ if (unlikely(exfat_forced_shutdown(mapping->host->i_sb)))
+ return -EIO;
+
return mpage_writepages(mapping, wbc, exfat_get_block);
}
@@ -452,6 +428,9 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
{
int ret;
+ if (unlikely(exfat_forced_shutdown(mapping->host->i_sb)))
+ return -EIO;
+
ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block);
if (ret < 0)
@@ -469,14 +448,6 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
int err;
err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
-
- if (ei->i_size_aligned < i_size_read(inode)) {
- exfat_fs_error(inode->i_sb,
- "invalid size(size(%llu) > aligned(%llu)\n",
- i_size_read(inode), ei->i_size_aligned);
- return -EIO;
- }
-
if (err < len)
exfat_write_failed(mapping, pos+len);
@@ -504,20 +475,6 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
int rw = iov_iter_rw(iter);
ssize_t ret;
- if (rw == WRITE) {
- /*
- * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
- * so we need to update the ->i_size_aligned to block boundary.
- *
- * But we must fill the remaining area or hole by nul for
- * updating ->i_size_aligned
- *
- * Return 0, and fallback to normal buffered write.
- */
- if (EXFAT_I(inode)->i_size_aligned < size)
- return 0;
- }
-
/*
* Need to use the DIO_LOCKING for avoiding the race
* condition of exfat_get_block() and ->truncate().
@@ -531,8 +488,18 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else
size = pos + ret;
- /* zero the unwritten part in the partially written block */
- if (rw == READ && pos < ei->valid_size && ei->valid_size < size) {
+ if (rw == WRITE) {
+ /*
+ * If the block had been partially written before this write,
+ * ->valid_size will not be updated in exfat_get_block(),
+ * update it here.
+ */
+ if (ei->valid_size < size) {
+ ei->valid_size = size;
+ mark_inode_dirty(inode);
+ }
+ } else if (pos < ei->valid_size && ei->valid_size < size) {
+ /* zero the unwritten part in the partially written block */
iov_iter_revert(iter, size - ei->valid_size);
iov_iter_zero(size - ei->valid_size, iter);
}
@@ -667,15 +634,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
i_size_write(inode, size);
- /* ondisk and aligned size should be aligned with block size */
- if (size & (inode->i_sb->s_blocksize - 1)) {
- size |= (inode->i_sb->s_blocksize - 1);
- size++;
- }
-
- ei->i_size_aligned = size;
- ei->i_size_ondisk = size;
-
exfat_save_attr(inode, info->attr);
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 631ad9e8e32a..2c4c44229352 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -372,8 +372,6 @@ static int exfat_find_empty_entry(struct inode *inode,
/* directory inode should be updated in here */
i_size_write(inode, size);
- ei->i_size_ondisk += sbi->cluster_size;
- ei->i_size_aligned += sbi->cluster_size;
ei->valid_size += sbi->cluster_size;
ei->flags = p_dir->flags;
inode->i_blocks += sbi->cluster_size >> 9;
@@ -549,6 +547,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
int err;
loff_t size = i_size_read(dir);
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return -EIO;
+
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE,
@@ -772,6 +773,9 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
struct exfat_entry_set_cache es;
int entry, err = 0;
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return -EIO;
+
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_chain_dup(&cdir, &ei->dir);
entry = ei->entry;
@@ -825,6 +829,9 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
int err;
loff_t size = i_size_read(dir);
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return -EIO;
+
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR,
@@ -915,6 +922,9 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
struct exfat_entry_set_cache es;
int entry, err;
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return -EIO;
+
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
exfat_chain_dup(&cdir, &ei->dir);
@@ -982,6 +992,9 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
struct exfat_entry_set_cache old_es, new_es;
int sync = IS_DIRSYNC(inode);
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return -EIO;
+
num_new_entries = exfat_calc_num_entries(p_uniname);
if (num_new_entries < 0)
return num_new_entries;
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index afdf13c34ff5..1ac011088ce7 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -779,8 +779,11 @@ int exfat_create_upcase_table(struct super_block *sb)
le32_to_cpu(ep->dentry.upcase.checksum));
brelse(bh);
- if (ret && ret != -EIO)
+ if (ret && ret != -EIO) {
+ /* free memory from exfat_load_upcase_table call */
+ exfat_free_upcase_table(sbi);
goto load_default;
+ }
/* load successfully */
return ret;
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 323ecebe6f0e..bd57844414aa 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -46,6 +46,9 @@ static int exfat_sync_fs(struct super_block *sb, int wait)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int err = 0;
+ if (unlikely(exfat_forced_shutdown(sb)))
+ return 0;
+
if (!wait)
return 0;
@@ -167,6 +170,41 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+int exfat_force_shutdown(struct super_block *sb, u32 flags)
+{
+ int ret;
+ struct exfat_sb_info *sbi = sb->s_fs_info;
+ struct exfat_mount_options *opts = &sbi->options;
+
+ if (exfat_forced_shutdown(sb))
+ return 0;
+
+ switch (flags) {
+ case EXFAT_GOING_DOWN_DEFAULT:
+ case EXFAT_GOING_DOWN_FULLSYNC:
+ ret = bdev_freeze(sb->s_bdev);
+ if (ret)
+ return ret;
+ bdev_thaw(sb->s_bdev);
+ set_bit(EXFAT_FLAGS_SHUTDOWN, &sbi->s_exfat_flags);
+ break;
+ case EXFAT_GOING_DOWN_NOSYNC:
+ set_bit(EXFAT_FLAGS_SHUTDOWN, &sbi->s_exfat_flags);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (opts->discard)
+ opts->discard = 0;
+ return 0;
+}
+
+static void exfat_shutdown(struct super_block *sb)
+{
+ exfat_force_shutdown(sb, EXFAT_GOING_DOWN_NOSYNC);
+}
+
static struct inode *exfat_alloc_inode(struct super_block *sb)
{
struct exfat_inode_info *ei;
@@ -193,6 +231,7 @@ static const struct super_operations exfat_sops = {
.sync_fs = exfat_sync_fs,
.statfs = exfat_statfs,
.show_options = exfat_show_options,
+ .shutdown = exfat_shutdown,
};
enum {
@@ -370,8 +409,6 @@ static int exfat_read_root(struct inode *inode)
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
ei->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff;
- ei->i_size_aligned = i_size_read(inode);
- ei->i_size_ondisk = i_size_read(inode);
exfat_save_attr(inode, EXFAT_ATTR_SUBDIR);
ei->i_crtime = simple_inode_init_ts(inode);
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index cd725bebe69e..2a135075468d 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -18,15 +18,17 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
if (!ext4_has_metadata_csum(sb))
return 1;
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
@@ -40,14 +42,16 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz)
+ struct buffer_head *bh)
{
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int sz;
if (!ext4_has_metadata_csum(sb))
return;
+ sz = EXT4_INODES_PER_GROUP(sb) >> 3;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 13196afe55ce..ef6a3c8f3a9a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -280,12 +280,20 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str de_name =
FSTR_INIT(de->name,
de->name_len);
+ u32 hash;
+ u32 minor_hash;
+
+ if (IS_CASEFOLDED(inode)) {
+ hash = EXT4_DIRENT_HASH(de);
+ minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ } else {
+ hash = 0;
+ minor_hash = 0;
+ }
/* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode,
- EXT4_DIRENT_HASH(de),
- EXT4_DIRENT_MINOR_HASH(de),
- &de_name, &fstr);
+ hash, minor_hash, &de_name, &fstr);
de_name = fstr;
fstr.len = save_len;
if (err)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ecc15e5f1eba..44b0d418143c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1058,6 +1058,7 @@ struct ext4_inode_info {
/* Number of ongoing updates on this inode */
atomic_t i_fc_updates;
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
@@ -1106,6 +1107,10 @@ struct ext4_inode_info {
/* mballoc */
atomic_t i_prealloc_active;
+
+ /* allocation reservation info for delalloc */
+ /* In case of bigalloc, this refer to clusters rather than blocks */
+ unsigned int i_reserved_data_blocks;
struct rb_root i_prealloc_node;
rwlock_t i_prealloc_lock;
@@ -1122,10 +1127,6 @@ struct ext4_inode_info {
/* ialloc */
ext4_group_t i_last_alloc_group;
- /* allocation reservation info for delalloc */
- /* In case of bigalloc, this refer to clusters rather than blocks */
- unsigned int i_reserved_data_blocks;
-
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1149,7 +1150,6 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -2338,9 +2338,9 @@ struct ext4_dir_entry_2 {
((struct ext4_dir_entry_hash *) \
(((void *)(entry)) + \
((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
-#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash)
+#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
- le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash)
+ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)
static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
@@ -2462,6 +2462,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5
#define DX_HASH_SIPHASH 6
+#define DX_HASH_LAST DX_HASH_SIPHASH
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
const void *address, unsigned int length)
@@ -2695,10 +2696,10 @@ struct mmpd_data {
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
- struct buffer_head *bh, int sz);
+ struct buffer_head *bh);
void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
@@ -3712,11 +3713,12 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *,
- struct ext4_ext_path **,
- struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_insert_extent(
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path **,
+ struct ext4_ext_path *,
int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
@@ -3853,6 +3855,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
+extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block);
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e067f2dd0335..34e25eee6521 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -84,12 +84,11 @@ static void ext4_extent_block_csum_set(struct inode *inode,
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags);
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
@@ -106,21 +105,27 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
return 0;
}
+static inline void ext4_ext_path_brelse(struct ext4_ext_path *path)
+{
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+}
+
static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
int depth, i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
+ for (i = 0; i <= depth; i++, path++)
+ ext4_ext_path_brelse(path);
}
void ext4_free_ext_path(struct ext4_ext_path *path)
{
+ if (IS_ERR_OR_NULL(path))
+ return;
ext4_ext_drop_refs(path);
kfree(path);
}
@@ -323,19 +328,18 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
return size;
}
-static inline int
+static inline struct ext4_ext_path *
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+ struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
- struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
- return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
+ return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
}
@@ -635,8 +639,7 @@ int ext4_ext_precache(struct inode *inode)
*/
if ((i == depth) ||
path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
@@ -689,7 +692,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
struct ext4_extent *ex;
int i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
eh = path[depth].p_hdr;
@@ -881,11 +884,10 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path **orig_path, int flags)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
- struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
gfp_t gfp_flags = GFP_NOFS;
@@ -906,7 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
ext4_ext_drop_refs(path);
if (depth > path[0].p_maxdepth) {
kfree(path);
- *orig_path = path = NULL;
+ path = NULL;
}
}
if (!path) {
@@ -961,8 +963,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
err:
ext4_free_ext_path(path);
- if (orig_path)
- *orig_path = NULL;
return ERR_PTR(ret);
}
@@ -1395,15 +1395,15 @@ out:
* finds empty index and adds new leaf.
* if no free index is found, then it requests in-depth growing.
*/
-static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int mb_flags,
- unsigned int gb_flags,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext)
+static struct ext4_ext_path *
+ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+ unsigned int mb_flags, unsigned int gb_flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_ext_path *curp;
int depth, i, err = 0;
+ ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block);
repeat:
i = depth = ext_depth(inode);
@@ -1422,42 +1422,38 @@ repeat:
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
- goto out;
+ goto errout;
/* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path))
- err = PTR_ERR(path);
- } else {
- /* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, mb_flags);
- if (err)
- goto out;
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ return path;
+ }
- /* refill path */
- path = ext4_find_extent(inode,
- (ext4_lblk_t)le32_to_cpu(newext->ee_block),
- ppath, gb_flags);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
+ /* tree is full, time to grow in depth */
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags);
+ if (err)
+ goto errout;
- /*
- * only first (depth 0 -> 1) produces free space;
- * in all other cases we have to split the grown tree
- */
- depth = ext_depth(inode);
- if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
- /* now we need to split */
- goto repeat;
- }
+ /* refill path */
+ path = ext4_find_extent(inode, ee_block, path, gb_flags);
+ if (IS_ERR(path))
+ return path;
+
+ /*
+ * only first (depth 0 -> 1) produces free space;
+ * in all other cases we have to split the grown tree
+ */
+ depth = ext_depth(inode);
+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+ /* now we need to split */
+ goto repeat;
}
-out:
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -1749,12 +1745,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
break;
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
+ goto clean;
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
return err;
}
@@ -1876,7 +1883,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
path[0].p_hdr->eh_max = cpu_to_le16(max_root);
- brelse(path[1].p_bh);
+ ext4_ext_path_brelse(path + 1);
ext4_free_blocks(handle, inode, NULL, blk, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}
@@ -1964,16 +1971,15 @@ out:
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
-int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_extent *newext, int gb_flags)
+struct ext4_ext_path *
+ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int gb_flags)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
- struct ext4_ext_path *npath = NULL;
- int depth, len, err;
+ int depth, len, err = 0;
ext4_lblk_t next;
int mb_flags = 0, unwritten;
@@ -1981,14 +1987,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
/* try to insert block into found extent and return */
@@ -2026,7 +2034,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
@@ -2051,7 +2059,7 @@ prepend:
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
- return err;
+ goto errout;
unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
@@ -2076,21 +2084,26 @@ prepend:
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
+ struct ext4_ext_path *npath;
+
ext_debug(inode, "next leaf block - %u\n", next);
- BUG_ON(npath != NULL);
npath = ext4_find_extent(inode, next, NULL, gb_flags);
- if (IS_ERR(npath))
- return PTR_ERR(npath);
+ if (IS_ERR(npath)) {
+ err = PTR_ERR(npath);
+ goto errout;
+ }
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
+ ext4_free_ext_path(path);
path = npath;
goto has_space;
}
ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ ext4_free_ext_path(npath);
}
/*
@@ -2099,10 +2112,10 @@ prepend:
*/
if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
mb_flags |= EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
- ppath, newext);
- if (err)
- goto cleanup;
+ path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
@@ -2111,7 +2124,7 @@ has_space:
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto cleanup;
+ goto errout;
if (!nearex) {
/* there is no extent in this leaf, create first one */
@@ -2169,17 +2182,20 @@ merge:
if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);
-
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
if (err)
- goto cleanup;
+ goto errout;
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (err)
+ goto errout;
-cleanup:
- ext4_free_ext_path(npath);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
static int ext4_fill_es_cache_info(struct inode *inode,
@@ -2279,27 +2295,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
{
int err;
ext4_fsblk_t leaf;
+ int k = depth - 1;
/* free index block */
- depth--;
- path = path + depth;
- leaf = ext4_idx_pblock(path->p_idx);
- if (unlikely(path->p_hdr->eh_entries == 0)) {
- EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ leaf = ext4_idx_pblock(path[k].p_idx);
+ if (unlikely(path[k].p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k);
return -EFSCORRUPTED;
}
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
return err;
- if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
- int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) {
+ int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx;
len *= sizeof(struct ext4_extent_idx);
- memmove(path->p_idx, path->p_idx + 1, len);
+ memmove(path[k].p_idx, path[k].p_idx + 1, len);
}
- le16_add_cpu(&path->p_hdr->eh_entries, -1);
- err = ext4_ext_dirty(handle, inode, path);
+ le16_add_cpu(&path[k].p_hdr->eh_entries, -1);
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
return err;
ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
@@ -2308,18 +2323,29 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- while (--depth >= 0) {
- if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
+ while (--k >= 0) {
+ if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr))
break;
- path--;
- err = ext4_ext_get_access(handle, inode, path);
+ err = ext4_ext_get_access(handle, inode, path + k);
if (err)
- break;
- path->p_idx->ei_block = (path+1)->p_idx->ei_block;
- err = ext4_ext_dirty(handle, inode, path);
+ goto clean;
+ path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block;
+ err = ext4_ext_dirty(handle, inode, path + k);
if (err)
- break;
+ goto clean;
}
+ return 0;
+
+clean:
+ /*
+ * The path[k].p_bh is either unmodified or with no verified bit
+ * set (see ext4_ext_get_access()). So just clear the verified bit
+ * of the successfully modified extents buffers, which will force
+ * these extents to be checked to avoid using inconsistent data.
+ */
+ while (++k < depth)
+ clear_buffer_verified(path[k].p_bh);
+
return err;
}
@@ -2872,11 +2898,12 @@ again:
* fail removing space due to ENOSPC so try to use
* reserved block if that happens.
*/
- err = ext4_force_split_extent_at(handle, inode, &path,
- end + 1, 1);
- if (err < 0)
+ path = ext4_force_split_extent_at(handle, inode, path,
+ end + 1, 1);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
goto out;
-
+ }
} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
partial.state == initial) {
/*
@@ -2934,8 +2961,7 @@ again:
err = ext4_ext_rm_leaf(handle, inode, path,
&partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
continue;
}
@@ -2997,8 +3023,7 @@ again:
err = ext4_ext_rm_idx(handle, inode, path, i);
}
/* root level has p_bh == NULL, brelse() eats this */
- brelse(path[i].p_bh);
- path[i].p_bh = NULL;
+ ext4_ext_path_brelse(path + i);
i--;
ext_debug(inode, "return to level %d\n", i);
}
@@ -3113,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
return;
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
+ EXTENT_STATUS_WRITTEN, 0);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -3147,16 +3172,14 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* a> the extent are splitted into two extent.
* b> split is not needed, and just mark the extent.
*
- * return 0 on success.
+ * Return an extent path pointer on success, or an error pointer on failure.
*/
-static int ext4_split_extent_at(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- ext4_lblk_t split,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag, int flags)
{
- struct ext4_ext_path *path = *ppath;
ext4_fsblk_t newblock;
ext4_lblk_t ee_block;
struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3226,10 +3249,31 @@ static int ext4_split_extent_at(handle_t *handle,
if (split_flag & EXT4_EXT_MARK_UNWRIT2)
ext4_ext_mark_unwritten(ex2);
- err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (!IS_ERR(path))
goto out;
+ err = PTR_ERR(path);
+ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
+ return path;
+
+ /*
+ * Get a new path to try to zeroout or fix the extent length.
+ * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent()
+ * will not return -ENOMEM, otherwise -ENOMEM will cause a
+ * retry in do_writepages(), and a WARN_ON may be triggered
+ * in ext4_da_update_reserve_space() due to an incorrect
+ * ee_len causing the i_reserved_data_blocks exception.
+ */
+ path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL);
+ if (IS_ERR(path)) {
+ EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld",
+ split, PTR_ERR(path));
+ return path;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+
if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
@@ -3280,14 +3324,17 @@ fix_extent_len:
* and err is a non-zero error code.
*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
- return err;
out:
+ if (err) {
+ ext4_free_ext_path(path);
+ path = ERR_PTR(err);
+ }
ext4_ext_show_leaf(inode, path);
- return err;
+ return path;
}
/*
- * ext4_split_extents() splits an extent and mark extent which is covered
+ * ext4_split_extent() splits an extent and mark extent which is covered
* by @map as split_flags indicates
*
* It may result in splitting the extent into multiple extents (up to three)
@@ -3297,21 +3344,18 @@ out:
* c> Splits in three extents: Somone is splitting in middle of the extent
*
*/
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags)
+static struct ext4_ext_path *ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag, int flags,
+ unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
unsigned int ee_len, depth;
- int err = 0;
int unwritten;
int split_flag1, flags1;
- int allocated = map->m_len;
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3327,28 +3371,27 @@ static int ext4_split_extent(handle_t *handle,
EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
- if (err)
- goto out;
- } else {
- allocated = ee_len - (map->m_lblk - ee_block);
- }
- /*
- * Update path is required because previous ext4_split_extent_at() may
- * result in split of original leaf or extent zeroout.
- */
- path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
- if (IS_ERR(path))
- return PTR_ERR(path);
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- if (!ex) {
- EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ if (IS_ERR(path))
+ return path;
+ /*
+ * Update path is required because previous ext4_split_extent_at
+ * may result in split of original leaf or extent zeroout.
+ */
+ path = ext4_find_extent(inode, map->m_lblk, path, flags);
+ if (IS_ERR(path))
+ return path;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ ext4_free_ext_path(path);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
}
- unwritten = ext4_ext_is_unwritten(ex);
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -3357,15 +3400,20 @@ static int ext4_split_extent(handle_t *handle,
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
EXT4_EXT_MARK_UNWRIT2);
}
- err = ext4_split_extent_at(handle, inode, ppath,
+ path = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
- if (err)
- goto out;
+ if (IS_ERR(path))
+ return path;
}
+ if (allocated) {
+ if (map->m_lblk + map->m_len > ee_block + ee_len)
+ *allocated = ee_len - (map->m_lblk - ee_block);
+ else
+ *allocated = map->m_len;
+ }
ext4_ext_show_leaf(inode, path);
-out:
- return err ? err : allocated;
+ return path;
}
/*
@@ -3388,13 +3436,11 @@ out:
* that are allocated and initialized.
* It is guaranteed to be >= map->m_len.
*/
-static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+static struct ext4_ext_path *
+ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
@@ -3404,7 +3450,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
unsigned int ee_len, depth, map_len = map->m_len;
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
- int allocated = 0;
unsigned int max_zeroout = 0;
ext_debug(inode, "logical block %llu, max_blocks %u\n",
@@ -3445,6 +3490,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - L2: we only attempt to merge with an extent stored in the
* same extent tree node.
*/
+ *allocated = 0;
if ((map->m_lblk == ee_block) &&
/* See if we can merge left */
(map_len < ee_len) && /*L1*/
@@ -3474,7 +3520,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3489,7 +3535,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
(map_len < ee_len) && /*L1*/
@@ -3520,7 +3566,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
trace_ext4_ext_convert_to_initialized_fastpath(inode,
map, ex, abut_ex);
@@ -3535,18 +3581,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
/* Result: number of initialized blocks past m_lblk */
- allocated = map_len;
+ *allocated = map_len;
}
}
- if (allocated) {
+ if (*allocated) {
/* Mark the block containing both extents as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
+ if (err)
+ goto errout;
goto out;
} else
- allocated = ee_len - (map->m_lblk - ee_block);
+ *allocated = ee_len - (map->m_lblk - ee_block);
WARN_ON(map->m_lblk < ee_block);
/*
@@ -3573,21 +3621,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (max_zeroout && (allocated > split_map.m_len)) {
- if (allocated <= max_zeroout) {
+ if (max_zeroout && (*allocated > split_map.m_len)) {
+ if (*allocated <= max_zeroout) {
/* case 3 or 5 */
zero_ex1.ee_block =
cpu_to_le32(split_map.m_lblk +
split_map.m_len);
zero_ex1.ee_len =
- cpu_to_le16(allocated - split_map.m_len);
+ cpu_to_le16(*allocated - split_map.m_len);
ext4_ext_store_pblock(&zero_ex1,
ext4_ext_pblock(ex) + split_map.m_lblk +
split_map.m_len - ee_block);
err = ext4_ext_zeroout(inode, &zero_ex1);
if (err)
goto fallback;
- split_map.m_len = allocated;
+ split_map.m_len = *allocated;
}
if (split_map.m_lblk - ee_block + split_map.m_len <
max_zeroout) {
@@ -3605,22 +3653,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_len += split_map.m_lblk - ee_block;
split_map.m_lblk = ee_block;
- allocated = map->m_len;
+ *allocated = map->m_len;
}
}
fallback:
- err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
- flags);
- if (err > 0)
- err = 0;
+ path = ext4_split_extent(handle, inode, path, &split_map, split_flag,
+ flags, NULL);
+ if (IS_ERR(path))
+ return path;
out:
/* If we have gotten a failure, don't zero out status tree */
- if (!err) {
- ext4_zeroout_es(inode, &zero_ex1);
- ext4_zeroout_es(inode, &zero_ex2);
- }
- return err ? err : allocated;
+ ext4_zeroout_es(inode, &zero_ex1);
+ ext4_zeroout_es(inode, &zero_ex2);
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -3645,15 +3695,16 @@ out:
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of unwritten extent to be written on success.
+ * The size of unwritten extent to be written is passed to the caller via the
+ * allocated pointer. Return an extent path pointer on success, or an error
+ * pointer on failure.
*/
-static int ext4_split_convert_extents(handle_t *handle,
+static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
- int flags)
+ struct ext4_ext_path *path,
+ int flags, unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
ext4_lblk_t eof_block;
ext4_lblk_t ee_block;
struct ext4_extent *ex;
@@ -3686,15 +3737,15 @@ static int ext4_split_convert_extents(handle_t *handle,
split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
}
flags |= EXT4_GET_BLOCKS_PRE_IO;
- return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags,
+ allocated);
}
-static int ext4_convert_unwritten_extents_endio(handle_t *handle,
- struct inode *inode,
- struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath)
+static struct ext4_ext_path *
+ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3722,20 +3773,21 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- goto out;
+ goto errout;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
@@ -3746,18 +3798,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-out:
+ if (err)
+ goto errout;
+
ext4_ext_show_leaf(inode, path);
- return err;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
+ struct ext4_ext_path *path,
unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3780,25 +3837,27 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
- err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
- if (err < 0)
- return err;
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
+
+ path = ext4_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- return err;
+ goto errout;
/* first mark the extent as unwritten */
ext4_ext_mark_unwritten(ex);
@@ -3810,7 +3869,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
- return err;
+ goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3819,22 +3878,24 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
- return 0;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ struct ext4_ext_path *path, int flags,
+ unsigned int *allocated, ext4_fsblk_t newblock)
{
- struct ext4_ext_path __maybe_unused *path = *ppath;
- int ret = 0;
int err = 0;
ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
(unsigned long long)map->m_lblk, map->m_len, flags,
- allocated);
+ *allocated);
ext4_ext_show_leaf(inode, path);
/*
@@ -3844,36 +3905,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
- allocated, newblock);
+ *allocated, newblock);
/* get_block() before submitting IO, split the extent */
if (flags & EXT4_GET_BLOCKS_PRE_IO) {
- ret = ext4_split_convert_extents(handle, inode, map, ppath,
- flags | EXT4_GET_BLOCKS_CONVERT);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ path = ext4_split_convert_extents(handle, inode, map, path,
+ flags | EXT4_GET_BLOCKS_CONVERT, allocated);
+ if (IS_ERR(path))
+ return path;
/*
- * shouldn't get a 0 return when splitting an extent unless
+ * shouldn't get a 0 allocated when splitting an extent unless
* m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
+ if (unlikely(*allocated == 0)) {
EXT4_ERROR_INODE(inode,
- "unexpected ret == 0, m_len = %u",
+ "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
- err = ext4_convert_unwritten_extents_endio(handle, inode, map,
- ppath);
- if (err < 0)
- goto out2;
+ path = ext4_convert_unwritten_extents_endio(handle, inode,
+ map, path);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
goto map_out;
}
@@ -3905,36 +3964,37 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
* For buffered writes, at writepage time, etc. Convert a
* discovered unwritten extent to written.
*/
- ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
- if (ret < 0) {
- err = ret;
- goto out2;
- }
+ path = ext4_ext_convert_to_initialized(handle, inode, map, path,
+ flags, allocated);
+ if (IS_ERR(path))
+ return path;
ext4_update_inode_fsync_trans(handle, inode, 1);
/*
- * shouldn't get a 0 return when converting an unwritten extent
+ * shouldn't get a 0 allocated when converting an unwritten extent
* unless m_len is 0 (bug) or extent has been corrupted
*/
- if (unlikely(ret == 0)) {
- EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
+ if (unlikely(*allocated == 0)) {
+ EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u",
map->m_len);
err = -EFSCORRUPTED;
- goto out2;
+ goto errout;
}
out:
- allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
out1:
map->m_pblk = newblock;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
ext4_ext_show_leaf(inode, path);
-out2:
- return err ? err : allocated;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
/*
@@ -4097,7 +4157,8 @@ again:
insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
- ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
+ ext4_es_insert_extent(inode, hole_start, len, ~0,
+ EXTENT_STATUS_HOLE, 0);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4131,7 +4192,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0, pblk;
- int err = 0, depth, ret;
+ int err = 0, depth;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -4144,7 +4205,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -4193,8 +4253,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- err = convert_initialized_extent(handle,
- inode, map, &path, &allocated);
+ path = convert_initialized_extent(handle,
+ inode, map, path, &allocated);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
} else if (!ext4_ext_is_unwritten(ex)) {
map->m_flags |= EXT4_MAP_MAPPED;
@@ -4206,13 +4268,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out;
}
- ret = ext4_ext_handle_unwritten_extents(
- handle, inode, map, &path, flags,
- allocated, newblock);
- if (ret < 0)
- err = ret;
- else
- allocated = ret;
+ path = ext4_ext_handle_unwritten_extents(
+ handle, inode, map, path, flags,
+ &allocated, newblock);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
}
}
@@ -4264,6 +4324,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
+ err = 0;
goto got_allocated_blocks;
}
@@ -4336,8 +4397,9 @@ got_allocated_blocks:
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
- if (err) {
+ path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
if (allocated_clusters) {
int fb_flags = 0;
@@ -4357,43 +4419,6 @@ got_allocated_blocks:
}
/*
- * Reduce the reserved cluster count to reflect successful deferred
- * allocation of delayed allocated clusters or direct allocation of
- * clusters discovered to be delayed allocated. Once allocated, a
- * cluster is not included in the reserved count.
- */
- if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /*
- * When allocating delayed allocated clusters, simply
- * reduce the reserved cluster count and claim quota
- */
- ext4_da_update_reserve_space(inode, allocated_clusters,
- 1);
- } else {
- ext4_lblk_t lblk, len;
- unsigned int n;
-
- /*
- * When allocating non-delayed allocated clusters
- * (from fallocate, filemap, DIO, or clusters
- * allocated when delalloc has been disabled by
- * ext4_nonda_switch), reduce the reserved cluster
- * count by the number of allocated clusters that
- * have previously been delayed allocated. Quota
- * has been claimed by ext4_mb_new_blocks() above,
- * so release the quota reservations made for any
- * previously delayed allocated clusters.
- */
- lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
- len = allocated_clusters << sbi->s_cluster_bits;
- n = ext4_es_delayed_clu(inode, lblk, len);
- if (n > 0)
- ext4_da_update_reserve_space(inode, (int) n, 0);
- }
- }
-
- /*
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an unwritten extent.
*/
@@ -5184,7 +5209,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* won't be shifted beyond EXT_MAX_BLOCKS.
*/
if (SHIFT == SHIFT_LEFT) {
- path = ext4_find_extent(inode, start - 1, &path,
+ path = ext4_find_extent(inode, start - 1, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5233,7 +5258,7 @@ again:
* becomes NULL to indicate the end of the loop.
*/
while (iterator && start <= stop) {
- path = ext4_find_extent(inode, *iterator, &path,
+ path = ext4_find_extent(inode, *iterator, path,
EXT4_EX_NOCACHE);
if (IS_ERR(path))
return PTR_ERR(path);
@@ -5535,6 +5560,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
+ ret = PTR_ERR(path);
goto out_stop;
}
@@ -5553,22 +5579,21 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
- ret = ext4_split_extent_at(handle, inode, &path,
+ path = ext4_split_extent_at(handle, inode, path,
offset_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
}
- ext4_free_ext_path(path);
- if (ret < 0) {
+ if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
+ ret = PTR_ERR(path);
goto out_stop;
}
- } else {
- ext4_free_ext_path(path);
}
+ ext4_free_ext_path(path);
ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
/*
@@ -5636,25 +5661,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int e1_len, e2_len, len;
int split = 0;
- path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+ path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE);
if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
- path1 = NULL;
- finish:
- count = 0;
- goto repeat;
+ goto errout;
}
- path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+ path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE);
if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
- path2 = NULL;
- goto finish;
+ goto errout;
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
/* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
- goto finish;
+ goto errout;
e1_blk = le32_to_cpu(ex1->ee_block);
e2_blk = le32_to_cpu(ex2->ee_block);
@@ -5676,7 +5697,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
next2 = e2_blk;
/* Do we have something to swap */
if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
- goto finish;
+ goto errout;
/* Move to the rightest boundary */
len = next1 - lblk1;
if (len < next2 - lblk2)
@@ -5686,28 +5707,32 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
lblk1 += len;
lblk2 += len;
count -= len;
- goto repeat;
+ continue;
}
/* Prepare left boundary */
if (e1_blk < lblk1) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (e2_blk < lblk2) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2, 0);
- if (unlikely(*erp))
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
/* Prepare right boundary */
len = count;
@@ -5718,30 +5743,34 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
if (len != e1_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1 + len, 0);
- if (unlikely(*erp))
- goto finish;
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1 + len, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
+ goto errout;
+ }
}
if (len != e2_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2 + len, 0);
- if (*erp)
- goto finish;
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2 + len, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
+ goto errout;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
if (split)
- goto repeat;
+ continue;
BUG_ON(e2_len != e1_len);
*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
/* Both extents are fully inside boundaries. Swap it now */
tmp_ex = *ex1;
@@ -5759,7 +5788,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
*erp = ext4_ext_dirty(handle, inode2, path2 +
path2->p_depth);
if (unlikely(*erp))
- goto finish;
+ goto errout;
*erp = ext4_ext_dirty(handle, inode1, path1 +
path1->p_depth);
/*
@@ -5769,17 +5798,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
* aborted anyway.
*/
if (unlikely(*erp))
- goto finish;
+ goto errout;
+
lblk1 += len;
lblk2 += len;
replaced_count += len;
count -= len;
-
- repeat:
- ext4_free_ext_path(path1);
- ext4_free_ext_path(path2);
- path1 = path2 = NULL;
}
+
+errout:
+ ext4_free_ext_path(path1);
+ ext4_free_ext_path(path2);
return replaced_count;
}
@@ -5814,11 +5843,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- path = NULL;
- goto out;
- }
+ if (IS_ERR(path))
+ return PTR_ERR(path);
depth = ext_depth(inode);
@@ -5880,7 +5906,7 @@ out:
int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
int len, int unwritten, ext4_fsblk_t pblk)
{
- struct ext4_ext_path *path = NULL, *ppath;
+ struct ext4_ext_path *path;
struct ext4_extent *ex;
int ret;
@@ -5896,30 +5922,34 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
if (le32_to_cpu(ex->ee_block) != start ||
ext4_ext_get_actual_len(ex) != len) {
/* We need to split this extent to match our extent first */
- ppath = path;
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path, start, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
- kfree(path);
- path = ext4_find_extent(inode, start, NULL, 0);
+ }
+
+ path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
- return -1;
- ppath = path;
+ return PTR_ERR(path);
+
ex = path[path->p_depth].p_ext;
WARN_ON(le32_to_cpu(ex->ee_block) != start);
+
if (ext4_ext_get_actual_len(ex) != len) {
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &ppath,
- start + len, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path,
+ start + len, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
- kfree(path);
- path = ext4_find_extent(inode, start, NULL, 0);
+ }
+
+ path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
- return -EINVAL;
+ return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
}
}
@@ -6001,12 +6031,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
+ if (!ex)
goto out;
- }
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_free_ext_path(path);
/* Count the number of data blocks */
cur = 0;
@@ -6032,32 +6059,28 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
ret = skip_hole(inode, &cur);
if (ret < 0)
goto out;
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
goto out;
numblks += path->p_depth;
- ext4_free_ext_path(path);
while (cur < end) {
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
break;
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
- return 0;
- }
+ if (!ex)
+ goto cleanup;
+
cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
ext4_ext_get_actual_len(ex));
ret = skip_hole(inode, &cur);
- if (ret < 0) {
- ext4_free_ext_path(path);
+ if (ret < 0)
break;
- }
- path2 = ext4_find_extent(inode, cur, NULL, 0);
- if (IS_ERR(path2)) {
- ext4_free_ext_path(path);
+
+ path2 = ext4_find_extent(inode, cur, path2, 0);
+ if (IS_ERR(path2))
break;
- }
+
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
cmp1 = cmp2 = 0;
if (i <= path->p_depth)
@@ -6069,13 +6092,14 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
if (cmp1 != cmp2 && cmp2 != 0)
numblks++;
}
- ext4_free_ext_path(path);
- ext4_free_ext_path(path2);
}
out:
inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
ext4_mark_inode_dirty(NULL, inode);
+cleanup:
+ ext4_free_ext_path(path);
+ ext4_free_ext_path(path2);
return 0;
}
@@ -6096,12 +6120,9 @@ int ext4_ext_clear_bb(struct inode *inode)
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
- if (!ex) {
- ext4_free_ext_path(path);
- return 0;
- }
+ if (!ex)
+ goto out;
end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
- ext4_free_ext_path(path);
cur = 0;
while (cur < end) {
@@ -6111,16 +6132,16 @@ int ext4_ext_clear_bb(struct inode *inode)
if (ret < 0)
break;
if (ret > 0) {
- path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
- if (!IS_ERR_OR_NULL(path)) {
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
+ if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++) {
-
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
0, path[j].p_block, 1, 1);
}
- ext4_free_ext_path(path);
+ } else {
+ path = NULL;
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
ext4_fc_record_regions(inode->i_sb, inode->i_ino,
@@ -6129,5 +6150,7 @@ int ext4_ext_clear_bb(struct inode *inode)
cur = cur + map.m_len;
}
+out:
+ ext4_free_ext_path(path);
return 0;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 17dcf13adde2..c786691dabd3 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -558,8 +558,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1))
return 1;
- /* we need to check delayed extent is without unwritten status */
- if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ /* we need to check delayed extent */
+ if (ext4_es_is_delayed(es1))
return 1;
return 0;
@@ -848,11 +848,12 @@ out:
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+ unsigned int status, int flags)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
+ int resv_used = 0, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
@@ -862,21 +863,14 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, flags, inode->i_ino);
if (!len)
return;
BUG_ON(end < lblk);
-
- if ((status & EXTENT_STATUS_DELAYED) &&
- (status & EXTENT_STATUS_WRITTEN)) {
- ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
- " delayed and written which can potentially "
- " cause data loss.", lblk, len);
- WARN_ON(1);
- }
+ WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
newes.es_lblk = lblk;
newes.es_len = len;
@@ -894,11 +888,11 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && revise_pending && !pr)
+ if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -922,16 +916,38 @@ retry:
if (revise_pending) {
err3 = __revise_pending(inode, lblk, len, &pr);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr) {
__free_pending(pr);
pr = NULL;
}
+ pending = err3;
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ /*
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
+ *
+ * When direct allocating (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by ext4_nonda_switch())
+ * an extent either 1) contains delayed blocks but start with
+ * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
+ * allocated blocks which belong to delayed allocated clusters when
+ * bigalloc feature is enabled, quota has already been claimed by
+ * ext4_mb_new_blocks(), so release the quota reservations made for
+ * any previously delayed allocated clusters instead of claim them
+ * again.
+ */
+ resv_used += pending;
+ if (resv_used)
+ ext4_da_update_reserve_space(inode, resv_used,
+ flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -1051,7 +1067,7 @@ out:
}
struct rsvd_count {
- int ndelonly;
+ int ndelayed;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -1077,10 +1093,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly = 0;
+ rc->ndelayed = 0;
/*
- * for bigalloc, note the first delonly block in the range has not
+ * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial
* cluster to track
@@ -1100,9 +1116,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * count_rsvd - count the clusters containing delayed and not unwritten
- * (delonly) blocks in a range within an extent and add to
- * the running tally in rsvd_count
+ * count_rsvd - count the clusters containing delayed blocks in a range
+ * within an extent and add to the running tally in rsvd_count
*
* @inode - file containing extent
* @lblk - first block in range
@@ -1119,13 +1134,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
- if (!ext4_es_is_delonly(es))
+ if (!ext4_es_is_delayed(es))
return;
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly += (int) len;
+ rc->ndelayed += (int) len;
return;
}
@@ -1135,7 +1150,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
- /* record the first block of the first delonly extent seen */
+ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
@@ -1149,7 +1164,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
}
@@ -1159,7 +1174,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1167,11 +1182,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/*
* if the current cluster starts on a cluster boundary, count the
- * number of whole delonly clusters in the extent
+ * number of whole delayed clusters in the extent
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly += nclu;
+ rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1231,10 +1246,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data
*
* The number of reservations to be released is equal to the number of
- * clusters containing delayed and not unwritten (delonly) blocks within
- * the range, minus the number of clusters still containing delonly blocks
- * at the ends of the range, and minus the number of pending reservations
- * within the range.
+ * clusters containing delayed blocks within the range, minus the number of
+ * clusters still containing delayed blocks at the ends of the range, and
+ * minus the number of pending reservations within the range.
*/
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es,
@@ -1245,33 +1259,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
- bool left_delonly, right_delonly, count_pending;
+ bool left_delayed, right_delayed, count_pending;
struct extent_status *es;
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly++;
+ rc->ndelayed++;
- if (rc->ndelonly == 0)
+ if (rc->ndelayed == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/*
- * decrease the delonly count by the number of clusters at the
- * ends of the range that still contain delonly blocks -
+ * decrease the delayed count by the number of clusters at the
+ * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved
*/
- left_delonly = right_delonly = false;
+ left_delayed = right_delayed = false;
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- left_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ left_delayed = true;
break;
}
node = rb_prev(&es->rb_node);
@@ -1279,7 +1293,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break;
es = rb_entry(node, struct extent_status, rb_node);
}
- if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
@@ -1289,9 +1303,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- right_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ right_delayed = true;
break;
}
node = rb_next(&es->rb_node);
@@ -1305,21 +1319,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the
- * original removed range containing delonly blocks are
+ * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation
* should be released with the information available in the
* extents status tree.
*/
if (first_lclu == last_lclu) {
- if (left_delonly | right_delonly)
+ if (left_delayed | right_delayed)
count_pending = false;
else
count_pending = true;
} else {
- if (left_delonly)
+ if (left_delayed)
first_lclu++;
- if (right_delonly)
+ if (right_delayed)
last_lclu--;
if (first_lclu <= last_lclu)
count_pending = true;
@@ -1330,13 +1344,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one
- * delonly block, so the delonly total must be reduced by one
+ * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released
*/
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly--;
+ rc->ndelayed--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr);
@@ -1347,7 +1361,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly;
+ return rc->ndelayed;
}
@@ -1940,7 +1954,7 @@ static struct pending_reservation *__get_pending(struct inode *inode,
* @lblk - logical block in the cluster to be added
* @prealloc - preallocated pending entry
*
- * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
@@ -1984,6 +1998,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
+ ret = 1;
out:
return ret;
@@ -2105,7 +2120,7 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if (err1 || err2 || err3) {
+ if (err1 || err2 || err3 < 0) {
if (lclu_allocated && !pr1)
pr1 = __alloc_pending(true);
if (end_allocated && !pr2)
@@ -2135,7 +2150,7 @@ retry:
if (lclu_allocated) {
err3 = __insert_pending(inode, lblk, &pr1);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr1) {
__free_pending(pr1);
@@ -2144,7 +2159,7 @@ retry:
}
if (end_allocated) {
err3 = __insert_pending(inode, end, &pr2);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr2) {
__free_pending(pr2);
@@ -2153,7 +2168,7 @@ retry:
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -2162,94 +2177,6 @@ error:
}
/*
- * __es_delayed_clu - count number of clusters containing blocks that
- * are delayed only
- *
- * @inode - file containing block range
- * @start - logical block defining start of range
- * @end - logical block defining end of range
- *
- * Returns the number of clusters containing only delayed (not delayed
- * and unwritten) blocks in the range specified by @start and @end. Any
- * cluster or part of a cluster within the range and containing a delayed
- * and not unwritten block within the range is counted as a whole cluster.
- */
-static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
-{
- struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
- struct extent_status *es;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct rb_node *node;
- ext4_lblk_t first_lclu, last_lclu;
- unsigned long long last_counted_lclu;
- unsigned int n = 0;
-
- /* guaranteed to be unequal to any ext4_lblk_t value */
- last_counted_lclu = ~0ULL;
-
- es = __es_tree_search(&tree->root, start);
-
- while (es && (es->es_lblk <= end)) {
- if (ext4_es_is_delonly(es)) {
- if (es->es_lblk <= start)
- first_lclu = EXT4_B2C(sbi, start);
- else
- first_lclu = EXT4_B2C(sbi, es->es_lblk);
-
- if (ext4_es_end(es) >= end)
- last_lclu = EXT4_B2C(sbi, end);
- else
- last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
-
- if (first_lclu == last_counted_lclu)
- n += last_lclu - first_lclu;
- else
- n += last_lclu - first_lclu + 1;
- last_counted_lclu = last_lclu;
- }
- node = rb_next(&es->rb_node);
- if (!node)
- break;
- es = rb_entry(node, struct extent_status, rb_node);
- }
-
- return n;
-}
-
-/*
- * ext4_es_delayed_clu - count number of clusters containing blocks that
- * are both delayed and unwritten
- *
- * @inode - file containing block range
- * @lblk - logical block defining start of range
- * @len - number of blocks in range
- *
- * Locking for external use of __es_delayed_clu().
- */
-unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_lblk_t end;
- unsigned int n;
-
- if (len == 0)
- return 0;
-
- end = lblk + len - 1;
- WARN_ON(end < lblk);
-
- read_lock(&ei->i_es_lock);
-
- n = __es_delayed_clu(inode, lblk, end);
-
- read_unlock(&ei->i_es_lock);
-
- return n;
-}
-
-/*
* __revise_pending - makes, cancels, or leaves unchanged pending cluster
* reservations for a specified block range depending
* upon the presence or absence of delayed blocks
@@ -2263,7 +2190,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
- * status. Must be called while holding i_es_lock.
+ * status. Must be called while holding i_es_lock. Returns number of new
+ * inserts pending cluster on insert pendings, returns 0 on remove pendings,
+ * return -ENOMEM on failure.
*/
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len,
@@ -2273,6 +2202,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int pendings = 0;
int ret = 0;
if (len == 0)
@@ -2294,49 +2224,53 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode,
- &ext4_es_is_delonly,
+ &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
} else {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
- l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
out:
- return ret;
+ return (ret < 0) ? ret : pendings;
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 3c8e2edee5d5..4424232de298 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -42,6 +42,10 @@ enum {
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
+/*
+ * Besides EXTENT_STATUS_REFERENCED, all these extent type masks
+ * are exclusive, only one type can be set at a time.
+ */
#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
@@ -51,7 +55,9 @@ enum {
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
- EXTENT_STATUS_HOLE) << ES_SHIFT)
+ EXTENT_STATUS_HOLE))
+
+#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1)))
struct ext4_sb_info;
struct ext4_extent;
@@ -129,7 +135,7 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status);
+ unsigned int status, int flags);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
@@ -156,7 +162,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
static inline unsigned int ext4_es_type(struct extent_status *es)
{
- return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
+ return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK;
}
static inline int ext4_es_is_written(struct extent_status *es)
@@ -184,11 +190,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
}
-static inline int ext4_es_is_delonly(struct extent_status *es)
-{
- return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
-}
-
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -224,17 +225,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
es->es_pblk = block;
}
-static inline void ext4_es_store_status(struct extent_status *es,
- unsigned int status)
-{
- es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
- (es->es_pblk & ~ES_MASK);
-}
-
static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb,
unsigned int status)
{
+ WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK));
+
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(pb & ~ES_MASK);
}
@@ -252,8 +248,6 @@ extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated,
bool end_allocated);
-extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 3926a05eceee..eaa5f5b51f50 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -339,22 +339,29 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
tid_t tid;
+ bool has_transaction = true;
+ bool is_ineligible;
if (ext4_fc_disabled(sb))
return;
- ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (handle && !IS_ERR(handle))
tid = handle->h_transaction->t_tid;
else {
read_lock(&sbi->s_journal->j_state_lock);
- tid = sbi->s_journal->j_running_transaction ?
- sbi->s_journal->j_running_transaction->t_tid : 0;
+ if (sbi->s_journal->j_running_transaction)
+ tid = sbi->s_journal->j_running_transaction->t_tid;
+ else
+ has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
spin_lock(&sbi->s_fc_lock);
- if (tid_gt(tid, sbi->s_fc_ineligible_tid))
+ is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (has_transaction &&
+ (!is_ineligible ||
+ (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
sbi->s_fc_ineligible_tid = tid;
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
@@ -1288,8 +1295,21 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, iter->i_sync_tid))
+ if (tid_geq(tid, iter->i_sync_tid)) {
ext4_fc_reset_inode(&iter->vfs_inode);
+ } else if (full) {
+ /*
+ * We are called after a full commit, inode has been
+ * modified while the commit was running. Re-enqueue
+ * the inode into STAGING, which will then be splice
+ * back into MAIN. This cannot happen during
+ * fastcommit because the journal is locked all the
+ * time in that case (and tid doesn't increase so
+ * tid check above isn't reliable).
+ */
+ list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+ &sbi->s_fc_q[FC_Q_STAGING]);
+ }
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
#if (BITS_PER_LONG < 64)
@@ -1772,7 +1792,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ret == 0) {
/* Range is not mapped */
- path = ext4_find_extent(inode, cur, NULL, 0);
+ path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path))
goto out;
memset(&newex, 0, sizeof(newex));
@@ -1783,11 +1803,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ext4_ext_is_unwritten(ex))
ext4_ext_mark_unwritten(&newex);
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_ext_insert_extent(
- NULL, inode, &path, &newex, 0);
+ path = ext4_ext_insert_extent(NULL, inode,
+ path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem));
- ext4_free_ext_path(path);
- if (ret)
+ if (IS_ERR(path))
goto out;
goto next;
}
@@ -1836,6 +1855,7 @@ next:
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits);
out:
+ ext4_free_ext_path(path);
iput(inode);
return 0;
}
@@ -1936,12 +1956,13 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
break;
if (ret > 0) {
- path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map.m_lblk, path, 0);
if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, true);
- ext4_free_ext_path(path);
+ } else {
+ path = NULL;
}
cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
@@ -1952,6 +1973,8 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
}
iput(inode);
}
+
+ ext4_free_ext_path(path);
}
/*
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c89e434db6b7..f14aed14b9cf 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,7 +306,7 @@ out:
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t count)
+ ssize_t written, ssize_t count)
{
handle_t *handle;
@@ -315,7 +315,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + count)) {
+ if (ext4_update_inode_size(inode, offset + written)) {
int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
ext4_journal_stop(handle);
@@ -323,21 +323,21 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
}
}
- if (inode->i_nlink)
+ if ((written == count) && inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- return count;
+ return written;
}
/*
* Clean up the inode after DIO or DAX extending write has completed and the
* inode size has been updated using ext4_handle_inode_extension().
*/
-static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
{
lockdep_assert_held_write(&inode->i_rwsem);
- if (count < 0) {
+ if (need_trunc) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -393,7 +393,7 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode))
return size;
- return ext4_handle_inode_extension(inode, pos, size);
+ return ext4_handle_inode_extension(inode, pos, size, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -586,7 +586,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
* writeback of delalloc blocks.
*/
WARN_ON_ONCE(ret == -EIOCBQUEUED);
- ext4_inode_extension_cleanup(inode, ret);
+ ext4_inode_extension_cleanup(inode, ret < 0);
}
out:
@@ -669,8 +669,8 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
if (extend) {
- ret = ext4_handle_inode_extension(inode, offset, ret);
- ext4_inode_extension_cleanup(inode, ret);
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
}
out:
inode_unlock(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9dfd768ed9f8..7f1a5f90dbbd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return 0;
- grp = ext4_get_group_info(sb, block_group);
-
if (buffer_verified(bh))
return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
return -EFSCORRUPTED;
@@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (buffer_verified(bh))
goto verified;
blk = ext4_inode_bitmap(sb, desc);
- if (!ext4_inode_bitmap_csum_verify(sb, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8) ||
+ if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
@@ -327,8 +326,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (percpu_counter_initialized(&sbi->s_dirs_counter))
percpu_counter_dec(&sbi->s_dirs_counter);
}
- ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
@@ -514,6 +512,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (min_inodes < 1)
min_inodes = 1;
min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
+ if (min_clusters < 0)
+ min_clusters = 0;
/*
* Start looking in the flex group where we last allocated an
@@ -755,10 +755,10 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
struct ext4_group_desc *gdp;
ext4_group_t group;
int bit;
- int err = -EFSCORRUPTED;
+ int err;
if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
- goto out;
+ return -EFSCORRUPTED;
group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
@@ -772,7 +772,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
}
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
- if (!gdp || !group_desc_bh) {
+ if (!gdp) {
err = -EINVAL;
goto out;
}
@@ -851,8 +851,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
@@ -860,6 +859,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
sync_dirty_buffer(group_desc_bh);
out:
+ brelse(inode_bitmap_bh);
return err;
}
@@ -1053,14 +1053,14 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
- && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
- IS_ERR(inode_bitmap_bh)) {
+ if (IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
}
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY) &&
+ EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
-repeat_in_this_group:
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2)
goto next_group;
@@ -1110,8 +1110,6 @@ repeat_in_this_group:
if (!ret2)
goto got; /* we grabbed the inode! */
- if (ino < EXT4_INODES_PER_GROUP(sb))
- goto repeat_in_this_group;
next_group:
if (++group == ngroups)
group = 0;
@@ -1224,8 +1222,7 @@ got:
}
}
if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index d8ca7f64f952..7404f0935c90 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
- /*
- * Update reserved blocks/metadata blocks after successful block
- * allocation which had been deferred till now.
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, count, 1);
-
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index edf4aa99a974..3536ca7e4fcc 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -601,10 +601,11 @@ retry:
goto out;
if (ext4_should_dioread_nolock(inode)) {
- ret = __block_write_begin(folio, from, to,
- ext4_get_block_unwritten);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block_unwritten);
} else
- ret = __block_write_begin(folio, from, to, ext4_get_block);
+ ret = ext4_block_write_begin(handle, folio, from, to,
+ ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -856,8 +857,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
goto out;
}
- ret = __block_write_begin(folio, 0, inline_size,
- ext4_da_get_block_prep);
+ ret = ext4_block_write_begin(NULL, folio, 0, inline_size,
+ ext4_da_get_block_prep);
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
folio_unlock(folio);
@@ -1665,24 +1666,36 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data)
{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
int ret;
- struct ext4_iloc iloc;
void *inline_start;
int inline_size;
- if (ext4_get_inode_loc(dir, &iloc))
- return NULL;
+ ret = ext4_get_inode_loc(dir, &is.iloc);
+ if (ret)
+ return ERR_PTR(ret);
down_read(&EXT4_I(dir)->xattr_sem);
+
+ ret = ext4_xattr_ibody_find(dir, &i, &is);
+ if (ret)
+ goto out;
+
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
goto out;
}
- inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block +
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
@@ -1692,20 +1705,23 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
goto out;
- inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc);
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
- ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir);
if (ret == 1)
goto out_find;
out:
- brelse(iloc.bh);
- iloc.bh = NULL;
+ brelse(is.iloc.bh);
+ if (ret < 0)
+ is.iloc.bh = ERR_PTR(ret);
+ else
+ is.iloc.bh = NULL;
out_find:
up_read(&EXT4_I(dir)->xattr_sem);
- return iloc.bh;
+ return is.iloc.bh;
}
int ext4_delete_inline_entry(handle_t *handle,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03374dc215d1..54bdd4884fe6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -49,6 +49,11 @@
#include <trace/events/ext4.h>
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+ struct inode *inode,
+ struct folio *folio,
+ unsigned from, unsigned to);
+
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
@@ -478,7 +483,89 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
+ map->m_pblk, status, 0);
+ return retval;
+}
+
+static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ struct extent_status es;
+ unsigned int status;
+ int err, retval = 0;
+
+ /*
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE
+ * indicates that the blocks and quotas has already been
+ * checked when the data was copied into the page cache.
+ */
+ if (map->m_flags & EXT4_MAP_DELAYED)
+ flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ /*
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * We need to check for EXT4 here because migrate could have
+ * changed the inode type in between.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ /*
+ * We allocated new blocks which will result in i_data's
+ * format changing. Force the migrate to fail by clearing
+ * migrate flags.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode %lu: "
+ "retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed. We also have to
+ * unmap metadata before zeroing as otherwise writeback can
+ * overwrite zeros with stale data from block device.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
+ err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
+ map->m_len);
+ if (err)
+ return err;
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if (flags & EXT4_GET_BLOCKS_PRE_IO &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_is_written(&es))
+ return retval;
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, flags);
+
return retval;
}
@@ -576,32 +663,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
+ retval = ext4_map_query_blocks(handle, inode, map);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -631,88 +693,13 @@ found:
return retval;
/*
- * Here we clear m_flags because after allocating an new extent,
- * it will be set again.
- */
- map->m_flags &= ~EXT4_MAP_FLAGS;
-
- /*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_block()
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
-
- /*
- * We need to check for EXT4 here because migrate
- * could have changed the inode type in between
- */
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags);
-
- if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
- /*
- * We allocated new blocks which will result in
- * i_data's format changing. Force the migrate
- * to fail by clearing migrate flags
- */
- ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
- }
- }
-
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- /*
- * We have to zeroout blocks before inserting them into extent
- * status tree. Otherwise someone could look them up there and
- * use them before they are really zeroed. We also have to
- * unmap metadata before zeroing as otherwise writeback can
- * overwrite zeros with stale data from block device.
- */
- if (flags & EXT4_GET_BLOCKS_ZERO &&
- map->m_flags & EXT4_MAP_MAPPED &&
- map->m_flags & EXT4_MAP_NEW) {
- ret = ext4_issue_zeroout(inode, map->m_lblk,
- map->m_pblk, map->m_len);
- if (ret) {
- retval = ret;
- goto out_sem;
- }
- }
-
- /*
- * If the extent has been zeroed out, we don't need to update
- * extent status tree.
- */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- if (ext4_es_is_written(&es))
- goto out_sem;
- }
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
-
-out_sem:
+ retval = ext4_map_create_blocks(handle, inode, map, flags);
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
@@ -1018,32 +1005,16 @@ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
- int dirty = buffer_dirty(bh);
- int ret;
-
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
- /*
- * __block_write_begin() could have dirtied some buffers. Clean
- * the dirty bit as jbd2_journal_get_write_access() could complain
- * otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_write_begin() isn't a real problem here as we clear
- * the bit before releasing a page lock and thus writeback cannot
- * ever write the buffer.
- */
- if (dirty)
- clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access");
- ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ return ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE);
- if (!ret && dirty)
- ret = ext4_dirty_journalled_data(handle, bh);
- return ret;
}
-#ifdef CONFIG_FS_ENCRYPTION
-static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
- get_block_t *get_block)
+int ext4_block_write_begin(handle_t *handle, struct folio *folio,
+ loff_t pos, unsigned len,
+ get_block_t *get_block)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
@@ -1056,6 +1027,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
struct buffer_head *bh, *head, *wait[2];
int nr_wait = 0;
int i;
+ bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
BUG_ON(from > PAGE_SIZE);
@@ -1085,10 +1057,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
+ /*
+ * We may be zeroing partial buffers or all new
+ * buffers in case of failure. Prepare JBD2 for
+ * that.
+ */
+ if (should_journal_data)
+ do_journal_get_write_access(handle,
+ inode, bh);
if (folio_test_uptodate(folio)) {
- clear_buffer_new(bh);
+ /*
+ * Unlike __block_write_begin() we leave
+ * dirtying of new uptodate buffers to
+ * ->write_end() time or
+ * folio_zero_new_buffers().
+ */
set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
@@ -1118,7 +1102,11 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
err = -EIO;
}
if (unlikely(err)) {
- folio_zero_new_buffers(folio, from, to);
+ if (should_journal_data)
+ ext4_journalled_zero_new_buffers(handle, inode, folio,
+ from, to);
+ else
+ folio_zero_new_buffers(folio, from, to);
} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -1134,7 +1122,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
return err;
}
-#endif
/*
* To preserve ordering, it is essential that the hole instantiation and
@@ -1216,19 +1203,12 @@ retry_journal:
/* In case writeback began while the folio was unlocked */
folio_wait_stable(folio);
-#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
- ret = ext4_block_write_begin(folio, pos, len,
+ ret = ext4_block_write_begin(handle, folio, pos, len,
ext4_get_block_unwritten);
else
- ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
-#else
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(folio, pos, len,
- ext4_get_block_unwritten);
- else
- ret = __block_write_begin(folio, pos, len, ext4_get_block);
-#endif
+ ret = ext4_block_write_begin(handle, folio, pos, len,
+ ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
folio_buffers(folio), from, to,
@@ -1241,7 +1221,7 @@ retry_journal:
folio_unlock(folio);
/*
- * __block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_rwsem.
*
@@ -1388,9 +1368,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
size = min(to, block_end) - start;
folio_zero_range(folio, start, size);
- write_end_fn(handle, inode, bh);
}
clear_buffer_new(bh);
+ write_end_fn(handle, inode, bh);
}
}
block_start = block_end;
@@ -1661,7 +1641,7 @@ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
int ret;
/* Has delalloc reservation? */
- if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
return 1;
/* Already been allocated? */
@@ -1782,7 +1762,7 @@ found:
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delonly(&es)) {
+ if (ext4_es_is_delayed(&es)) {
map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
@@ -2217,11 +2197,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
* possible.
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
- * the blocks in question are delalloc blocks. This indicates
- * that the blocks and quotas has already been checked when
- * the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
@@ -2229,8 +2204,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & BIT(BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
@@ -2959,11 +2932,8 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
-#ifdef CONFIG_FS_ENCRYPTION
- ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
-#else
- ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep);
-#endif
+ ret = ext4_block_write_begin(NULL, folio, pos, len,
+ ext4_da_get_block_prep);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
@@ -4067,7 +4037,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
stop_block);
ext4_es_insert_extent(inode, first_block, hole_len, ~0,
- EXTENT_STATUS_HOLE);
+ EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(handle, inode, first_block, stop_block);
@@ -5276,8 +5246,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- tid_t commit_tid = 0;
+ tid_t commit_tid;
int ret;
+ bool has_transaction;
offset = inode->i_size & (PAGE_SIZE - 1);
/*
@@ -5302,12 +5273,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
folio_put(folio);
if (ret != -EBUSY)
return;
- commit_tid = 0;
+ has_transaction = false;
read_lock(&journal->j_state_lock);
- if (journal->j_committing_transaction)
+ if (journal->j_committing_transaction) {
commit_tid = journal->j_committing_transaction->t_tid;
+ has_transaction = true;
+ }
read_unlock(&journal->j_state_lock);
- if (commit_tid)
+ if (has_transaction)
jbd2_log_wait_commit(journal, commit_tid);
}
}
@@ -6216,7 +6189,8 @@ retry_alloc:
if (folio_pos(folio) + len > size)
len = size - folio_pos(folio);
- err = __block_write_begin(folio, 0, len, ext4_get_block);
+ err = ext4_block_write_begin(handle, folio, 0, len,
+ ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
if (ext4_journal_folio_buffers(handle, folio, len))
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e8bf5972dd47..1c77400bd88e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1343,10 +1343,10 @@ group_extend_out:
me.moved_len = 0;
donor = fdget(me.donor_fd);
- if (!donor.file)
+ if (!fd_file(donor))
return -EBADF;
- if (!(donor.file->f_mode & FMODE_WRITE)) {
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto mext_out;
}
@@ -1367,7 +1367,7 @@ group_extend_out:
if (err)
goto mext_out;
- err = ext4_move_extents(filp, donor.file, me.orig_start,
+ err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
mnt_drop_write_file(filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9dda9cd68ab2..d73e38323879 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2356,7 +2356,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len &&
- ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
+ ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start;
start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
@@ -2553,7 +2553,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- stripe = EXT4_B2C(sbi, sbi->s_stripe);
+ stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
@@ -2928,9 +2928,11 @@ repeat:
if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
else {
- bool is_stripe_aligned = sbi->s_stripe &&
+ bool is_stripe_aligned =
+ (sbi->s_stripe >=
+ sbi->s_cluster_ratio) &&
!(ac->ac_g_ex.fe_len %
- EXT4_B2C(sbi, sbi->s_stripe));
+ EXT4_NUM_B2C(sbi, sbi->s_stripe));
if ((cr == CR_GOAL_LEN_FAST ||
cr == CR_BEST_AVAIL_LEN) &&
@@ -3075,8 +3077,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
seq_puts(seq, " ]");
if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
seq_puts(seq, " Block bitmap corrupted!");
- seq_puts(seq, "\n");
-
+ seq_putc(seq, '\n');
return 0;
}
@@ -3707,7 +3708,7 @@ int ext4_mb_init(struct super_block *sb)
*/
if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup(
- sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
+ sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
}
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -3887,11 +3888,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/*
* Clear the trimmed flag for the group so that the next
* ext4_trim_fs can trim it.
- * If the volume is mounted with -o discard, online discard
- * is supported and the free blocks will be trimmed online.
*/
- if (!test_opt(sb, DISCARD))
- EXT4_MB_GRP_CLEAR_TRIMMED(db);
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
if (!db->bb_free_root.rb_node) {
/* No more items in the per group rb tree
@@ -6515,8 +6513,9 @@ do_more:
" group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
- } else
- EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+ }
+
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
mb_free_blocks(inode, &e4b, bit, count_clusters);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d98ac2af8199..1b0dfd963d3f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
- path = NULL;
goto err_out;
}
@@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
if (retval < 0)
goto err_out;
- retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
+ path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ if (IS_ERR(path))
+ retval = PTR_ERR(path);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
ext4_free_ext_path(path);
@@ -663,8 +664,8 @@ int ext4_ind_migrate(struct inode *inode)
if (unlikely(ret2 && !ret))
ret = ret2;
errout:
- ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
out_unlock:
ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return ret;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 204f53b23622..b64661ea6e0e 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,27 +17,23 @@
* get_ext_path() - Find an extent path for designated logical block number.
* @inode: inode to be searched
* @lblock: logical block number to find an extent path
- * @ppath: pointer to an extent path pointer (for output)
+ * @path: pointer to an extent path
*
- * ext4_find_extent wrapper. Return 0 on success, or a negative error value
- * on failure.
+ * ext4_find_extent wrapper. Return an extent path pointer on success,
+ * or an error pointer on failure.
*/
-static inline int
+static inline struct ext4_ext_path *
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **ppath)
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path;
-
- path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
if (path[ext_depth(inode)].p_ext == NULL) {
ext4_free_ext_path(path);
- *ppath = NULL;
- return -ENODATA;
+ return ERR_PTR(-ENODATA);
}
- *ppath = path;
- return 0;
+ return path;
}
/**
@@ -95,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
int ret = 0;
ext4_lblk_t last = from + count;
while (from < last) {
- *err = get_ext_path(inode, from, &path);
- if (*err)
- goto out;
+ path = get_ext_path(inode, from, path);
+ if (IS_ERR(path)) {
+ *err = PTR_ERR(path);
+ return ret;
+ }
ext = path[ext_depth(inode)].p_ext;
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
@@ -166,15 +164,16 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
return 0;
}
-/* Force page buffers uptodate w/o dropping page's lock */
-static int
-mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
+/* Force folio buffers uptodate w/o dropping folio's lock */
+static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
{
struct inode *inode = folio->mapping->host;
sector_t block;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head;
unsigned int blocksize, block_start, block_end;
- int i, err, nr = 0, partial = 0;
+ int nr = 0;
+ bool partial = false;
+
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -186,19 +185,21 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
if (!head)
head = create_empty_buffers(folio, blocksize, 0);
- block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits);
- for (bh = head, block_start = 0; bh != head || !block_start;
- block++, block_start = block_end, bh = bh->b_this_page) {
+ block = folio_pos(folio) >> inode->i_blkbits;
+ block_end = 0;
+ bh = head;
+ do {
+ block_start = block_end;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
- partial = 1;
+ partial = true;
continue;
}
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
- err = ext4_get_block(inode, block, bh, 0);
+ int err = ext4_get_block(inode, block, bh, 0);
if (err)
return err;
if (!buffer_mapped(bh)) {
@@ -207,21 +208,30 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
continue;
}
}
- BUG_ON(nr >= MAX_BUF_PER_PAGE);
- arr[nr++] = bh;
- }
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+ ext4_read_bh_nowait(bh, 0, NULL);
+ nr++;
+ } while (block++, (bh = bh->b_this_page) != head);
+
/* No io required */
if (!nr)
goto out;
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (!bh_uptodate_or_lock(bh)) {
- err = ext4_read_bh(bh, 0, NULL);
- if (err)
- return err;
- }
- }
+ bh = head;
+ do {
+ if (bh_offset(bh) + blocksize <= from)
+ continue;
+ if (bh_offset(bh) > to)
+ break;
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ continue;
+ return -EIO;
+ } while ((bh = bh->b_this_page) != head);
out:
if (!partial)
folio_mark_uptodate(folio);
@@ -624,9 +634,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
int offset_in_page;
int unwritten, cur_len;
- ret = get_ext_path(orig_inode, o_start, &path);
- if (ret)
+ path = get_ext_path(orig_inode, o_start, path);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
ex = path[path->p_depth].p_ext;
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6a95713f9193..790db7eac6c2 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1482,7 +1482,7 @@ static bool ext4_match(struct inode *parent,
}
/*
- * Returns 0 if not found, -1 on failure, and 1 on success
+ * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success
*/
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct inode *dir, struct ext4_filename *fname,
@@ -1503,7 +1503,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
* a full check */
if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
buf_size, offset))
- return -1;
+ return -EFSCORRUPTED;
*res_dir = de;
return 1;
}
@@ -1511,7 +1511,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
if (de_len <= 0)
- return -1;
+ return -EFSCORRUPTED;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
@@ -1574,7 +1574,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
&has_inline_data);
if (inlined)
*inlined = has_inline_data;
- if (has_inline_data)
+ if (has_inline_data || IS_ERR(ret))
goto cleanup_and_exit;
}
@@ -1663,8 +1663,10 @@ restart:
goto cleanup_and_exit;
} else {
brelse(bh);
- if (i < 0)
+ if (i < 0) {
+ ret = ERR_PTR(i);
goto cleanup_and_exit;
+ }
}
next:
if (++block >= nblocks)
@@ -1758,7 +1760,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
if (retval == 1)
goto success;
brelse(bh);
- if (retval == -1) {
+ if (retval < 0) {
bh = ERR_PTR(ERR_BAD_DX_DIR);
goto errout;
}
@@ -1999,7 +2001,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
split = count/2;
hash2 = map[split].hash;
- continued = hash2 == map[split - 1].hash;
+ continued = split > 0 ? hash2 == map[split - 1].hash : 0;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
(unsigned long)dx_get_block(frame->at),
hash2, split, count-split));
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 8494492582ab..5d3a9dc9a32d 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -221,7 +221,7 @@ int ext4_mpage_readpages(struct inode *inode,
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev;
int length;
@@ -263,6 +263,7 @@ int ext4_mpage_readpages(struct inode *inode,
unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset;
+ first_block = map.m_pblk + map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* needed? */
@@ -271,8 +272,6 @@ int ext4_mpage_readpages(struct inode *inode,
}
if (page_block == blocks_per_page)
break;
- blocks[page_block] = map.m_pblk + map_offset +
- relative_block;
page_block++;
block_in_file++;
}
@@ -307,7 +306,9 @@ int ext4_mpage_readpages(struct inode *inode,
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != map.m_pblk-1)
+ if (!page_block)
+ first_block = map.m_pblk;
+ else if (first_block + page_block != map.m_pblk)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
@@ -316,7 +317,6 @@ int ext4_mpage_readpages(struct inode *inode,
break;
} else if (page_block == blocks_per_page)
break;
- blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
@@ -339,7 +339,7 @@ int ext4_mpage_readpages(struct inode *inode,
* This folio will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ if (bio && (last_block_in_bio != first_block - 1 ||
!fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
@@ -355,7 +355,7 @@ int ext4_mpage_readpages(struct inode *inode,
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, folio->index);
- bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_iter.bi_sector = first_block << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
if (rac)
bio->bi_opf |= REQ_RAHEAD;
@@ -371,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = blocks[blocks_per_page - 1];
+ last_block_in_bio = first_block + blocks_per_page - 1;
continue;
confused:
if (bio) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0ba9837d65ca..e04eb08b9060 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1319,8 +1319,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
if (!bh)
return -EIO;
- ext4_inode_bitmap_csum_set(sb, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_inode_bitmap_csum_set(sb, gdp, bh);
brelse(bh);
bh = ext4_get_bitmap(sb, group_data->block_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e72145c4ae5a..16a4ce704460 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -735,11 +735,12 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
- * Make sure updated value of ->s_mount_flags will be visible before
- * ->s_flags update
+ * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
+ * modifications. We don't set SB_RDONLY because that requires
+ * sb->s_umount semaphore and setting it without proper remount
+ * procedure is confusing code such as freeze_super() leading to
+ * deadlocks and other problems.
*/
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
}
static void update_super_work(struct work_struct *work)
@@ -3045,7 +3046,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset)
seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
rc = _ext4_show_options(seq, sb, 1);
- seq_puts(seq, "\n");
+ seq_putc(seq, '\n');
return rc;
}
@@ -5087,16 +5088,27 @@ out:
return ret;
}
-static void ext4_hash_info_init(struct super_block *sb)
+static int ext4_hash_info_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
unsigned int i;
+ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_def_hash_version > DX_HASH_LAST) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid default hash set in the superblock");
+ return -EINVAL;
+ } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) {
+ ext4_msg(sb, KERN_ERR,
+ "SIPHASH is not a valid default hash value");
+ return -EINVAL;
+ }
+
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
- sbi->s_def_hash_version = es->s_def_hash_version;
if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
@@ -5114,6 +5126,7 @@ static void ext4_hash_info_init(struct super_block *sb)
#endif
}
}
+ return 0;
}
static int ext4_block_group_meta_init(struct super_block *sb, int silent)
@@ -5165,6 +5178,18 @@ static int ext4_block_group_meta_init(struct super_block *sb, int silent)
return 0;
}
+/*
+ * It's hard to get stripe aligned blocks if stripe is not aligned with
+ * cluster, just disable stripe and alert user to simplify code and avoid
+ * stripe aligned allocation which will rarely succeed.
+ */
+static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ return (stripe > 0 && sbi->s_cluster_ratio > 1 &&
+ stripe % sbi->s_cluster_ratio != 0);
+}
+
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
struct ext4_super_block *es = NULL;
@@ -5249,7 +5274,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err)
goto failed_mount;
- ext4_hash_info_init(sb);
+ err = ext4_hash_info_init(sb);
+ if (err)
+ goto failed_mount;
err = ext4_handle_clustersize(sb);
if (err)
@@ -5272,13 +5299,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi);
- /*
- * It's hard to get stripe aligned blocks if stripe is not aligned with
- * cluster, just disable stripe and alert user to simpfy code and avoid
- * stripe aligned allocation which will rarely successes.
- */
- if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
- sbi->s_stripe % sbi->s_cluster_ratio != 0) {
+ if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) {
ext4_msg(sb, KERN_WARNING,
"stripe (%lu) is not aligned with cluster size (%u), "
"stripe is disabled",
@@ -5313,6 +5334,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -5534,7 +5557,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* Save the original bdev mapping's wb_err value which could be
* used to detect the metadata async write error.
*/
- spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
&sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
@@ -5614,8 +5636,8 @@ failed_mount3a:
failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
- del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
+ del_timer_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
if (sbi->s_chksum_driver)
@@ -6441,6 +6463,15 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
+ if ((ctx->spec & EXT4_SPEC_s_stripe) &&
+ ext4_is_stripe_incompatible(sb, ctx->s_stripe)) {
+ ext4_msg(sb, KERN_WARNING,
+ "stripe (%lu) is not aligned with cluster size (%u), "
+ "stripe is disabled",
+ ctx->s_stripe, sbi->s_cluster_ratio);
+ ctx->s_stripe = 0;
+ }
+
/*
* Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
* two calls to ext4_should_dioread_nolock() to return inconsistent
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 46ce2f21fef9..e0e1956dcdd3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -458,7 +458,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
ext4_xattr_inode_set_ref(inode, 1);
} else {
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_XATTR);
inode->i_flags |= S_NOQUOTA;
inode_unlock(inode);
}
@@ -1039,7 +1039,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
s64 ref_count;
int ret;
- inode_lock(ea_inode);
+ inode_lock_nested(ea_inode, I_MUTEX_XATTR);
ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
if (ret)
@@ -2879,33 +2879,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
if (*ea_inode_array == NULL) {
/*
* Start with 15 inodes, so it fits into a power-of-two size.
- * If *ea_inode_array is NULL, this is essentially offsetof()
*/
- (*ea_inode_array) =
- kmalloc(offsetof(struct ext4_xattr_inode_array,
- inodes[EIA_MASK]),
- GFP_NOFS);
+ (*ea_inode_array) = kmalloc(
+ struct_size(*ea_inode_array, inodes, EIA_MASK),
+ GFP_NOFS);
if (*ea_inode_array == NULL)
return -ENOMEM;
(*ea_inode_array)->count = 0;
} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
/* expand the array once all 15 + n * 16 slots are full */
struct ext4_xattr_inode_array *new_array = NULL;
- int count = (*ea_inode_array)->count;
- /* if new_array is NULL, this is essentially offsetof() */
new_array = kmalloc(
- offsetof(struct ext4_xattr_inode_array,
- inodes[count + EIA_INCR]),
- GFP_NOFS);
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count + EIA_INCR),
+ GFP_NOFS);
if (new_array == NULL)
return -ENOMEM;
memcpy(new_array, *ea_inode_array,
- offsetof(struct ext4_xattr_inode_array, inodes[count]));
+ struct_size(*ea_inode_array, inodes,
+ (*ea_inode_array)->count));
kfree(*ea_inode_array);
*ea_inode_array = new_array;
}
- (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode;
+ (*ea_inode_array)->count++;
+ (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
return 0;
}
@@ -3036,8 +3034,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
*
* Create a new entry in the extended attribute block cache, and insert
* it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
*/
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
@@ -3065,8 +3061,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
*
* Compare two extended attribute blocks for equality.
*
- * Returns 0 if the blocks are equal, 1 if they differ, and
- * a negative error number on errors.
+ * Returns 0 if the blocks are equal, 1 if they differ.
*/
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index bd97c4aa8177..b25c2d7b5f99 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -32,8 +32,7 @@ struct ext4_xattr_header {
__le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */
- __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */
- /* id = inum if refcount=1, blknum otherwise */
+ __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */
__u32 h_reserved[3]; /* zero right now */
};
@@ -130,8 +129,8 @@ struct ext4_xattr_ibody_find {
};
struct ext4_xattr_inode_array {
- unsigned int count; /* # of used items in the array */
- struct inode *inodes[];
+ unsigned int count;
+ struct inode *inodes[] __counted_by(count);
};
extern const struct xattr_handler ext4_xattr_user_handler;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bdd96329dddd..7f76460b721f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -99,7 +99,7 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
- f2fs_handle_page_eio(sbi, page->index, META);
+ f2fs_handle_page_eio(sbi, page_folio(page), META);
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -345,30 +345,31 @@ static int __f2fs_write_meta_page(struct page *page,
enum iostat_type io_type)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct folio *folio = page_folio(page);
- trace_f2fs_writepage(page_folio(page), META);
+ trace_f2fs_writepage(folio, META);
if (unlikely(f2fs_cp_error(sbi))) {
if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_META);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
goto redirty_out;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
+ if (wbc->for_reclaim && folio->index < GET_SUM_BLOCK(sbi, 0))
goto redirty_out;
- f2fs_do_write_meta_page(sbi, page, io_type);
+ f2fs_do_write_meta_page(sbi, folio, io_type);
dec_page_count(sbi, F2FS_DIRTY_META);
if (wbc->for_reclaim)
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);
- unlock_page(page);
+ folio_unlock(folio);
if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_write(sbi, META);
@@ -1551,7 +1552,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++)
f2fs_update_meta_page(sbi, nm_i->nat_bits +
- (i << F2FS_BLKSIZE_BITS), blk + i);
+ F2FS_BLK_TO_BYTES(i), blk + i);
}
/* write out checkpoint buffer at block 0 */
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 990b93689b46..7f26440e8595 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -90,11 +90,13 @@ bool f2fs_is_compressed_page(struct page *page)
static void f2fs_set_compressed_page(struct page *page,
struct inode *inode, pgoff_t index, void *data)
{
- attach_page_private(page, (void *)data);
+ struct folio *folio = page_folio(page);
+
+ folio_attach_private(folio, (void *)data);
/* i_crypto_info and iv index */
- page->index = index;
- page->mapping = inode->i_mapping;
+ folio->index = index;
+ folio->mapping = inode->i_mapping;
}
static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock)
@@ -160,17 +162,17 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse)
cc->cluster_idx = NULL_CLUSTER;
}
-void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
+void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio)
{
unsigned int cluster_ofs;
- if (!f2fs_cluster_can_merge_page(cc, page->index))
+ if (!f2fs_cluster_can_merge_page(cc, folio->index))
f2fs_bug_on(F2FS_I_SB(cc->inode), 1);
- cluster_ofs = offset_in_cluster(cc, page->index);
- cc->rpages[cluster_ofs] = page;
+ cluster_ofs = offset_in_cluster(cc, folio->index);
+ cc->rpages[cluster_ofs] = folio_page(folio, 0);
cc->nr_rpages++;
- cc->cluster_idx = cluster_idx(cc, page->index);
+ cc->cluster_idx = cluster_idx(cc, folio->index);
}
#ifdef CONFIG_F2FS_FS_LZO
@@ -879,7 +881,7 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc)
f2fs_bug_on(F2FS_I_SB(cc->inode), !page);
/* beyond EOF */
- if (page->index >= nr_pages)
+ if (page_folio(page)->index >= nr_pages)
return true;
}
return false;
@@ -945,7 +947,7 @@ static int __f2fs_get_cluster_blocks(struct inode *inode,
unsigned int cluster_size = F2FS_I(inode)->i_cluster_size;
int count, i;
- for (i = 1, count = 1; i < cluster_size; i++) {
+ for (i = 0, count = 0; i < cluster_size; i++) {
block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + i);
@@ -956,8 +958,8 @@ static int __f2fs_get_cluster_blocks(struct inode *inode,
return count;
}
-static int __f2fs_cluster_blocks(struct inode *inode,
- unsigned int cluster_idx, bool compr_blks)
+static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx,
+ enum cluster_check_type type)
{
struct dnode_of_data dn;
unsigned int start_idx = cluster_idx <<
@@ -978,10 +980,12 @@ static int __f2fs_cluster_blocks(struct inode *inode,
}
if (dn.data_blkaddr == COMPRESS_ADDR) {
- if (compr_blks)
- ret = __f2fs_get_cluster_blocks(inode, &dn);
- else
+ if (type == CLUSTER_COMPR_BLKS)
+ ret = 1 + __f2fs_get_cluster_blocks(inode, &dn);
+ else if (type == CLUSTER_IS_COMPR)
ret = 1;
+ } else if (type == CLUSTER_RAW_BLKS) {
+ ret = __f2fs_get_cluster_blocks(inode, &dn);
}
fail:
f2fs_put_dnode(&dn);
@@ -991,7 +995,16 @@ fail:
/* return # of compressed blocks in compressed cluster */
static int f2fs_compressed_blocks(struct compress_ctx *cc)
{
- return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true);
+ return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx,
+ CLUSTER_COMPR_BLKS);
+}
+
+/* return # of raw blocks in non-compressed cluster */
+static int f2fs_decompressed_blocks(struct inode *inode,
+ unsigned int cluster_idx)
+{
+ return __f2fs_cluster_blocks(inode, cluster_idx,
+ CLUSTER_RAW_BLKS);
}
/* return whether cluster is compressed one or not */
@@ -999,7 +1012,16 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
return __f2fs_cluster_blocks(inode,
index >> F2FS_I(inode)->i_log_cluster_size,
- false);
+ CLUSTER_IS_COMPR);
+}
+
+/* return whether cluster contains non raw blocks or not */
+bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index)
+{
+ unsigned int cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size;
+
+ return f2fs_decompressed_blocks(inode, cluster_idx) !=
+ F2FS_I(inode)->i_cluster_size;
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -1093,7 +1115,7 @@ retry:
if (PageUptodate(page))
f2fs_put_page(page, 1);
else
- f2fs_compress_ctx_add_page(cc, page);
+ f2fs_compress_ctx_add_page(cc, page_folio(page));
}
if (!f2fs_cluster_is_empty(cc)) {
@@ -1123,7 +1145,7 @@ retry:
}
f2fs_wait_on_page_writeback(page, DATA, true, true);
- f2fs_compress_ctx_add_page(cc, page);
+ f2fs_compress_ctx_add_page(cc, page_folio(page));
if (!PageUptodate(page)) {
release_and_retry:
@@ -1523,7 +1545,8 @@ continue_unlock:
if (!clear_page_dirty_for_io(cc->rpages[i]))
goto continue_unlock;
- ret = f2fs_write_single_data_page(cc->rpages[i], &submitted,
+ ret = f2fs_write_single_data_page(page_folio(cc->rpages[i]),
+ &submitted,
NULL, NULL, wbc, io_type,
compr_blocks, false);
if (ret) {
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5dfa0207ad8f..94f7b084f601 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -7,7 +7,6 @@
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
-#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
@@ -355,7 +354,7 @@ static void f2fs_write_end_io(struct bio *bio)
}
f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
- page->index != nid_of_node(page));
+ page_folio(page)->index != nid_of_node(page));
dec_page_count(sbi, type);
if (f2fs_in_warm_node_list(sbi, page))
@@ -704,7 +703,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio = __bio_alloc(fio, 1);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
- fio->page->index, fio, GFP_NOIO);
+ page_folio(fio->page)->index, fio, GFP_NOIO);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
@@ -803,7 +802,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
fio->new_blkaddr));
if (f2fs_crypt_mergeable_bio(*bio,
fio->page->mapping->host,
- fio->page->index, fio) &&
+ page_folio(fio->page)->index, fio) &&
bio_add_page(*bio, page, PAGE_SIZE, 0) ==
PAGE_SIZE) {
ret = 0;
@@ -903,7 +902,7 @@ alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
- fio->page->index, fio, GFP_NOIO);
+ page_folio(fio->page)->index, fio, GFP_NOIO);
add_bio_entry(fio->sbi, bio, page, fio->temp);
} else {
@@ -996,13 +995,13 @@ next:
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
fio->new_blkaddr) ||
!f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host,
- bio_page->index, fio)))
+ page_folio(bio_page)->index, fio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
- bio_page->index, fio, GFP_NOIO);
+ page_folio(bio_page)->index, fio, GFP_NOIO);
io->fio = *fio;
}
@@ -1087,7 +1086,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
}
/* This can handle encryption stuffs */
-static int f2fs_submit_page_read(struct inode *inode, struct page *page,
+static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
block_t blkaddr, blk_opf_t op_flags,
bool for_write)
{
@@ -1095,14 +1094,14 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
struct bio *bio;
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
- page->index, for_write);
+ folio->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, blkaddr);
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) {
iostat_update_and_unbind_ctx(bio);
if (bio->bi_private)
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
@@ -1270,7 +1269,7 @@ got_it:
return page;
}
- err = f2fs_submit_page_read(inode, page, dn.data_blkaddr,
+ err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr,
op_flags, for_write);
if (err)
goto put_err;
@@ -1713,6 +1712,14 @@ skip:
dn.ofs_in_node = end_offset;
}
+ if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ map->m_may_create) {
+ /* the next block to be allocated may not be contiguous. */
+ if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) ==
+ CAP_BLKS_PER_SEC(sbi) - 1)
+ goto sync_out;
+ }
+
if (pgofs >= end)
goto sync_out;
else if (dn.ofs_in_node < end_offset)
@@ -1939,7 +1946,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
inode_lock_shared(inode);
- maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+ maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode));
if (start > maxbytes) {
ret = -EFBIG;
goto out;
@@ -2064,7 +2071,7 @@ out:
static inline loff_t f2fs_readpage_limit(struct inode *inode)
{
if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
- return inode->i_sb->s_maxbytes;
+ return F2FS_BLK_TO_BYTES(max_file_blocks(inode));
return i_size_read(inode);
}
@@ -2208,19 +2215,22 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
struct page *page = cc->rpages[i];
+ struct folio *folio;
if (!page)
continue;
- if ((sector_t)page->index >= last_block_in_file) {
- zero_user_segment(page, 0, PAGE_SIZE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- } else if (!PageUptodate(page)) {
+
+ folio = page_folio(page);
+ if ((sector_t)folio->index >= last_block_in_file) {
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ } else if (!folio_test_uptodate(folio)) {
continue;
}
- unlock_page(page);
+ folio_unlock(folio);
if (for_write)
- put_page(page);
+ folio_put(folio);
cc->rpages[i] = NULL;
cc->nr_rpages--;
}
@@ -2280,7 +2290,7 @@ skip_reading_dnode:
}
for (i = 0; i < cc->nr_cpages; i++) {
- struct page *page = dic->cpages[i];
+ struct folio *folio = page_folio(dic->cpages[i]);
block_t blkaddr;
struct bio_post_read_ctx *ctx;
@@ -2290,7 +2300,8 @@ skip_reading_dnode:
f2fs_wait_on_block_writeback(inode, blkaddr);
- if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
+ if (f2fs_load_compressed_page(sbi, folio_page(folio, 0),
+ blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages)) {
f2fs_decompress_cluster(dic, true);
break;
@@ -2300,7 +2311,7 @@ skip_reading_dnode:
if (bio && (!page_is_mergeable(sbi, bio,
*last_block_in_bio, blkaddr) ||
- !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) {
+ !f2fs_crypt_mergeable_bio(bio, inode, folio->index, NULL))) {
submit_and_realloc:
f2fs_submit_read_bio(sbi, bio, DATA);
bio = NULL;
@@ -2309,7 +2320,7 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
f2fs_ra_op_flags(rac),
- page->index, for_write);
+ folio->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
f2fs_decompress_end_io(dic, ret, true);
@@ -2319,7 +2330,7 @@ submit_and_realloc:
}
}
- if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ if (!bio_add_folio(bio, folio, blocksize, 0))
goto submit_and_realloc;
ctx = get_post_read_ctx(bio);
@@ -2430,7 +2441,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
if (ret)
goto set_error_page;
- f2fs_compress_ctx_add_page(&cc, &folio->page);
+ f2fs_compress_ctx_add_page(&cc, folio);
goto next_page;
read_single_page:
@@ -2645,21 +2656,24 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio)
int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
- struct page *page = fio->page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(fio->page);
+ struct inode *inode = folio->mapping->host;
struct dnode_of_data dn;
struct node_info ni;
bool ipu_force = false;
+ bool atomic_commit;
int err = 0;
/* Use COW inode to make dnode_of_data for atomic write */
- if (f2fs_is_atomic_file(inode))
+ atomic_commit = f2fs_is_atomic_file(inode) &&
+ page_private_atomic(folio_page(folio, 0));
+ if (atomic_commit)
set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
else
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) &&
- f2fs_lookup_read_extent_cache_block(inode, page->index,
+ f2fs_lookup_read_extent_cache_block(inode, folio->index,
&fio->old_blkaddr)) {
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE))
@@ -2674,7 +2688,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi))
return -EAGAIN;
- err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE);
if (err)
goto out;
@@ -2682,8 +2696,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
- ClearPageUptodate(page);
- clear_page_private_gcing(page);
+ folio_clear_uptodate(folio);
+ clear_page_private_gcing(folio_page(folio, 0));
goto out_writepage;
}
got_it:
@@ -2709,7 +2723,7 @@ got_it:
if (err)
goto out_writepage;
- set_page_writeback(page);
+ folio_start_writeback(folio);
f2fs_put_dnode(&dn);
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
@@ -2717,11 +2731,11 @@ got_it:
if (err) {
if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- end_page_writeback(page);
+ folio_end_writeback(folio);
} else {
set_inode_flag(inode, FI_UPDATE_WRITE);
}
- trace_f2fs_do_write_data_page(page_folio(page), IPU);
+ trace_f2fs_do_write_data_page(folio, IPU);
return err;
}
@@ -2743,15 +2757,17 @@ got_it:
if (err)
goto out_writepage;
- set_page_writeback(page);
+ folio_start_writeback(folio);
if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR)
f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false);
/* LFS mode write path */
f2fs_outplace_write_data(&dn, fio);
- trace_f2fs_do_write_data_page(page_folio(page), OPU);
+ trace_f2fs_do_write_data_page(folio, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
+ if (atomic_commit)
+ clear_page_private_atomic(folio_page(folio, 0));
out_writepage:
f2fs_put_dnode(&dn);
out:
@@ -2760,7 +2776,7 @@ out:
return err;
}
-int f2fs_write_single_data_page(struct page *page, int *submitted,
+int f2fs_write_single_data_page(struct folio *folio, int *submitted,
struct bio **bio,
sector_t *last_block,
struct writeback_control *wbc,
@@ -2768,12 +2784,13 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
int compr_blocks,
bool allow_balance)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
+ struct page *page = folio_page(folio, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long)i_size)
>> PAGE_SHIFT;
- loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
+ loff_t psize = (loff_t)(folio->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
bool quota_inode = IS_NOQUOTA(inode);
@@ -2797,11 +2814,11 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
.last_block = last_block,
};
- trace_f2fs_writepage(page_folio(page), DATA);
+ trace_f2fs_writepage(folio, DATA);
/* we should bypass data pages to proceed the kworker jobs */
if (unlikely(f2fs_cp_error(sbi))) {
- mapping_set_error(page->mapping, -EIO);
+ mapping_set_error(folio->mapping, -EIO);
/*
* don't drop any dirty dentry pages for keeping lastest
* directory structure.
@@ -2819,7 +2836,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (page->index < end_index ||
+ if (folio->index < end_index ||
f2fs_verity_in_progress(inode) ||
compr_blocks)
goto write;
@@ -2829,10 +2846,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
* this page does not have to be written to disk.
*/
offset = i_size & (PAGE_SIZE - 1);
- if ((page->index >= end_index + 1) || !offset)
+ if ((folio->index >= end_index + 1) || !offset)
goto out;
- zero_user_segment(page, offset, PAGE_SIZE);
+ folio_zero_segment(folio, offset, folio_size(folio));
write:
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || quota_inode) {
@@ -2862,7 +2879,7 @@ write:
err = -EAGAIN;
if (f2fs_has_inline_data(inode)) {
- err = f2fs_write_inline_data(inode, page);
+ err = f2fs_write_inline_data(inode, folio);
if (!err)
goto out;
}
@@ -2892,7 +2909,7 @@ done:
out:
inode_dec_dirty_pages(inode);
if (err) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
clear_page_private_gcing(page);
}
@@ -2902,7 +2919,7 @@ out:
f2fs_remove_dirty_inode(inode);
submitted = NULL;
}
- unlock_page(page);
+ folio_unlock(folio);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
!F2FS_I(inode)->wb_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs);
@@ -2920,7 +2937,7 @@ out:
return 0;
redirty_out:
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
/*
* pageout() in MM translates EAGAIN, so calls handle_write_error()
* -> mapping_set_error() -> set_bit(AS_EIO, ...).
@@ -2929,29 +2946,30 @@ redirty_out:
*/
if (!err || wbc->for_reclaim)
return AOP_WRITEPAGE_ACTIVATE;
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
goto out;
if (f2fs_compressed_file(inode)) {
- if (f2fs_is_compressed_cluster(inode, page->index)) {
- redirty_page_for_writepage(wbc, page);
+ if (f2fs_is_compressed_cluster(inode, folio->index)) {
+ folio_redirty_for_writepage(wbc, folio);
return AOP_WRITEPAGE_ACTIVATE;
}
}
out:
#endif
- return f2fs_write_single_data_page(page, NULL, NULL, NULL,
+ return f2fs_write_single_data_page(folio, NULL, NULL, NULL,
wbc, FS_DATA_IO, 0, true);
}
@@ -3157,11 +3175,11 @@ continue_unlock:
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
folio_get(folio);
- f2fs_compress_ctx_add_page(&cc, &folio->page);
+ f2fs_compress_ctx_add_page(&cc, folio);
continue;
}
#endif
- ret = f2fs_write_single_data_page(&folio->page,
+ ret = f2fs_write_single_data_page(folio,
&submitted, &bio, &last_block,
wbc, io_type, 0, true);
if (ret == AOP_WRITEPAGE_ACTIVATE)
@@ -3369,11 +3387,11 @@ void f2fs_write_failed(struct inode *inode, loff_t to)
}
static int prepare_write_begin(struct f2fs_sb_info *sbi,
- struct page *page, loff_t pos, unsigned len,
+ struct folio *folio, loff_t pos, unsigned int len,
block_t *blk_addr, bool *node_changed)
{
- struct inode *inode = page->mapping->host;
- pgoff_t index = page->index;
+ struct inode *inode = folio->mapping->host;
+ pgoff_t index = folio->index;
struct dnode_of_data dn;
struct page *ipage;
bool locked = false;
@@ -3410,13 +3428,13 @@ restart:
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA(inode)) {
- f2fs_do_read_inline_data(page_folio(page), ipage);
+ f2fs_do_read_inline_data(folio, ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
set_page_private_inline(ipage);
goto out;
}
- err = f2fs_convert_inline_page(&dn, page);
+ err = f2fs_convert_inline_page(&dn, folio_page(folio, 0));
if (err || dn.data_blkaddr != NULL_ADDR)
goto out;
}
@@ -3509,12 +3527,12 @@ unlock_out:
}
static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
- struct page *page, loff_t pos, unsigned int len,
+ struct folio *folio, loff_t pos, unsigned int len,
block_t *blk_addr, bool *node_changed, bool *use_cow)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct inode *cow_inode = F2FS_I(inode)->cow_inode;
- pgoff_t index = page->index;
+ pgoff_t index = folio->index;
int err = 0;
block_t ori_blk_addr = NULL_ADDR;
@@ -3620,10 +3638,10 @@ repeat:
*foliop = folio;
if (f2fs_is_atomic_file(inode))
- err = prepare_atomic_write_begin(sbi, &folio->page, pos, len,
+ err = prepare_atomic_write_begin(sbi, folio, pos, len,
&blkaddr, &need_balance, &use_cow);
else
- err = prepare_write_begin(sbi, &folio->page, pos, len,
+ err = prepare_write_begin(sbi, folio, pos, len,
&blkaddr, &need_balance);
if (err)
goto put_folio;
@@ -3648,7 +3666,7 @@ repeat:
if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
- folio_zero_segment(folio, len, PAGE_SIZE);
+ folio_zero_segment(folio, len, folio_size(folio));
return 0;
}
@@ -3662,8 +3680,8 @@ repeat:
goto put_folio;
}
err = f2fs_submit_page_read(use_cow ?
- F2FS_I(inode)->cow_inode : inode, &folio->page,
- blkaddr, 0, true);
+ F2FS_I(inode)->cow_inode : inode,
+ folio, blkaddr, 0, true);
if (err)
goto put_folio;
@@ -3727,6 +3745,9 @@ static int f2fs_write_end(struct file *file,
folio_mark_dirty(folio);
+ if (f2fs_is_atomic_file(inode))
+ set_page_private_atomic(folio_page(folio, 0));
+
if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
f2fs_i_size_write(inode, pos + copied);
@@ -4117,9 +4138,8 @@ const struct address_space_operations f2fs_dblock_aops = {
.swap_deactivate = f2fs_swap_deactivate,
};
-void f2fs_clear_page_cache_dirty_tag(struct page *page)
+void f2fs_clear_page_cache_dirty_tag(struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
unsigned long flags;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8b0e1e71b667..546b8ba91261 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -275,7 +275,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build nm */
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
- si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS);
+ si->base_mem += F2FS_BLK_TO_BYTES(NM_I(sbi)->nat_bits_blocks);
si->base_mem += NM_I(sbi)->nat_blocks *
f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK);
si->base_mem += NM_I(sbi)->nat_blocks / 8;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index cbd7a5e96a37..1136539a57a8 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -166,7 +166,8 @@ static unsigned long dir_block_index(unsigned int level,
unsigned long bidx = 0;
for (i = 0; i < level; i++)
- bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
+ bidx += mul_u32_u32(dir_buckets(i, dir_level),
+ bucket_blocks(i));
bidx += idx * bucket_blocks(level);
return bidx;
}
@@ -841,6 +842,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ pgoff_t index = page_folio(page)->index;
int i;
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
@@ -866,8 +868,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
if (bit_pos == NR_DENTRY_IN_BLOCK &&
- !f2fs_truncate_hole(dir, page->index, page->index + 1)) {
- f2fs_clear_page_cache_dirty_tag(page);
+ !f2fs_truncate_hole(dir, index, index + 1)) {
+ f2fs_clear_page_cache_dirty_tag(page_folio(page));
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
clear_page_private_all(page);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index fd1fc06359ee..62ac440d9416 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -366,7 +366,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
static void __drop_largest_extent(struct extent_tree *et,
pgoff_t fofs, unsigned int len)
{
- if (fofs < et->largest.fofs + et->largest.len &&
+ if (fofs < (pgoff_t)et->largest.fofs + et->largest.len &&
fofs + len > et->largest.fofs) {
et->largest.len = 0;
et->largest_updated = true;
@@ -456,7 +456,7 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
if (type == EX_READ &&
et->largest.fofs <= pgofs &&
- et->largest.fofs + et->largest.len > pgofs) {
+ (pgoff_t)et->largest.fofs + et->largest.len > pgofs) {
*ei = et->largest;
ret = true;
stat_inc_largest_node_hit(sbi);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ac19c61f0c3e..33f5449dc22d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -11,7 +11,6 @@
#include <linux/uio.h>
#include <linux/types.h>
#include <linux/page-flags.h>
-#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/crc32.h>
#include <linux/magic.h>
@@ -134,6 +133,12 @@ typedef u32 nid_t;
#define COMPRESS_EXT_NUM 16
+enum blkzone_allocation_policy {
+ BLKZONE_ALLOC_PRIOR_SEQ, /* Prioritize writing to sequential zones */
+ BLKZONE_ALLOC_ONLY_SEQ, /* Only allow writing to sequential zones */
+ BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */
+};
+
/*
* An implementation of an rwsem that is explicitly unfair to readers. This
* prevents priority inversion when a low-priority reader acquires the read lock
@@ -285,6 +290,7 @@ enum {
APPEND_INO, /* for append ino list */
UPDATE_INO, /* for update ino list */
TRANS_DIR_INO, /* for transactions dir ino list */
+ XATTR_DIR_INO, /* for xattr updated dir ino list */
FLUSH_INO, /* for multiple device flushing */
MAX_INO_ENTRY, /* max. list */
};
@@ -784,7 +790,6 @@ enum {
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
FI_DATA_EXIST, /* indicate data exists */
- FI_INLINE_DOTS, /* indicate inline dot dentries */
FI_SKIP_WRITES, /* should skip data page writeback */
FI_OPU_WRITE, /* used for opu per file */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
@@ -802,6 +807,7 @@ enum {
FI_ALIGNED_WRITE, /* enable aligned write */
FI_COW_FILE, /* indicate COW file */
FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */
+ FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */
FI_ATOMIC_REPLACE, /* indicate atomic replace */
FI_OPENED_FILE, /* indicate file has been opened */
FI_MAX, /* max flag, never be used */
@@ -1155,6 +1161,7 @@ enum cp_reason_type {
CP_FASTBOOT_MODE,
CP_SPEC_LOG_NUM,
CP_RECOVER_DIR,
+ CP_XATTR_DIR,
};
enum iostat_type {
@@ -1293,6 +1300,7 @@ struct f2fs_gc_control {
bool no_bg_gc; /* check the space and stop bg_gc */
bool should_migrate_blocks; /* should migrate blocks */
bool err_gc_skipped; /* return EAGAIN if GC skipped */
+ bool one_time; /* require one time GC in one migration unit */
unsigned int nr_free_secs; /* # of free sections to do GC */
};
@@ -1418,7 +1426,8 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr);
* bit 1 PAGE_PRIVATE_ONGOING_MIGRATION
* bit 2 PAGE_PRIVATE_INLINE_INODE
* bit 3 PAGE_PRIVATE_REF_RESOURCE
- * bit 4- f2fs private data
+ * bit 4 PAGE_PRIVATE_ATOMIC_WRITE
+ * bit 5- f2fs private data
*
* Layout B: lowest bit should be 0
* page.private is a wrapped pointer.
@@ -1428,6 +1437,7 @@ enum {
PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */
PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */
PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */
+ PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */
PAGE_PRIVATE_MAX
};
@@ -1559,6 +1569,8 @@ struct f2fs_sb_info {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int blocks_per_blkz; /* F2FS blocks per zone */
unsigned int max_open_zones; /* max open zone resources of the zoned device */
+ /* For adjust the priority writing position of data in zone UFS */
+ unsigned int blkzone_alloc_policy;
#endif
/* for node-related operations */
@@ -1685,6 +1697,8 @@ struct f2fs_sb_info {
unsigned int max_victim_search;
/* migration granularity of garbage collection, unit: segment */
unsigned int migration_granularity;
+ /* migration window granularity of garbage collection, unit: segment */
+ unsigned int migration_window_granularity;
/*
* for stat information.
@@ -1994,6 +2008,16 @@ static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
return (struct f2fs_super_block *)(sbi->raw_super);
}
+static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio,
+ pgoff_t index)
+{
+ pgoff_t idx_in_folio = index % (1 << folio_order(folio));
+
+ return (struct f2fs_super_block *)
+ (page_address(folio_page(folio, idx_in_folio)) +
+ F2FS_SUPER_OFFSET);
+}
+
static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
{
return (struct f2fs_checkpoint *)(sbi->ckpt);
@@ -2396,14 +2420,17 @@ static inline void clear_page_private_##name(struct page *page) \
PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE);
PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE);
PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
+PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
static inline unsigned long get_page_private_data(struct page *page)
{
@@ -2435,6 +2462,7 @@ static inline void clear_page_private_all(struct page *page)
clear_page_private_reference(page);
clear_page_private_gcing(page);
clear_page_private_inline(page);
+ clear_page_private_atomic(page);
f2fs_bug_on(F2FS_P_SB(page), page_private(page));
}
@@ -2854,13 +2882,26 @@ static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type)
return false;
}
+static inline bool is_inflight_read_io(struct f2fs_sb_info *sbi)
+{
+ return get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_DIO_READ);
+}
+
static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
{
+ bool zoned_gc = (type == GC_TIME &&
+ F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_BLKZONED));
+
if (sbi->gc_mode == GC_URGENT_HIGH)
return true;
- if (is_inflight_io(sbi, type))
- return false;
+ if (zoned_gc) {
+ if (is_inflight_read_io(sbi))
+ return false;
+ } else {
+ if (is_inflight_io(sbi, type))
+ return false;
+ }
if (sbi->gc_mode == GC_URGENT_MID)
return true;
@@ -2869,6 +2910,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
(type == DISCARD_TIME || type == GC_TIME))
return true;
+ if (zoned_gc)
+ return true;
+
return f2fs_time_over(sbi, type);
}
@@ -2900,26 +2944,27 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
}
static inline int f2fs_has_extra_attr(struct inode *inode);
-static inline block_t data_blkaddr(struct inode *inode,
- struct page *node_page, unsigned int offset)
+static inline unsigned int get_dnode_base(struct inode *inode,
+ struct page *node_page)
{
- struct f2fs_node *raw_node;
- __le32 *addr_array;
- int base = 0;
- bool is_inode = IS_INODE(node_page);
+ if (!IS_INODE(node_page))
+ return 0;
- raw_node = F2FS_NODE(node_page);
+ return inode ? get_extra_isize(inode) :
+ offset_in_addr(&F2FS_NODE(node_page)->i);
+}
- if (is_inode) {
- if (!inode)
- /* from GC path only */
- base = offset_in_addr(&raw_node->i);
- else if (f2fs_has_extra_attr(inode))
- base = get_extra_isize(inode);
- }
+static inline __le32 *get_dnode_addr(struct inode *inode,
+ struct page *node_page)
+{
+ return blkaddr_in_node(F2FS_NODE(node_page)) +
+ get_dnode_base(inode, node_page);
+}
- addr_array = blkaddr_in_node(raw_node);
- return le32_to_cpu(addr_array[base + offset]);
+static inline block_t data_blkaddr(struct inode *inode,
+ struct page *node_page, unsigned int offset)
+{
+ return le32_to_cpu(*(get_dnode_addr(inode, node_page) + offset));
}
static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
@@ -3038,10 +3083,8 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
return;
fallthrough;
case FI_DATA_EXIST:
- case FI_INLINE_DOTS:
case FI_PIN_FILE:
case FI_COMPRESS_RELEASED:
- case FI_ATOMIC_COMMITTED:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -3163,8 +3206,6 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
set_bit(FI_INLINE_DENTRY, fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
set_bit(FI_DATA_EXIST, fi->flags);
- if (ri->i_inline & F2FS_INLINE_DOTS)
- set_bit(FI_INLINE_DOTS, fi->flags);
if (ri->i_inline & F2FS_EXTRA_ATTR)
set_bit(FI_EXTRA_ATTR, fi->flags);
if (ri->i_inline & F2FS_PIN_FILE)
@@ -3185,8 +3226,6 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
ri->i_inline |= F2FS_INLINE_DENTRY;
if (is_inode_flag_set(inode, FI_DATA_EXIST))
ri->i_inline |= F2FS_DATA_EXIST;
- if (is_inode_flag_set(inode, FI_INLINE_DOTS))
- ri->i_inline |= F2FS_INLINE_DOTS;
if (is_inode_flag_set(inode, FI_EXTRA_ATTR))
ri->i_inline |= F2FS_EXTRA_ATTR;
if (is_inode_flag_set(inode, FI_PIN_FILE))
@@ -3267,11 +3306,6 @@ static inline int f2fs_exist_data(struct inode *inode)
return is_inode_flag_set(inode, FI_DATA_EXIST);
}
-static inline int f2fs_has_inline_dots(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_INLINE_DOTS);
-}
-
static inline int f2fs_is_mmap_file(struct inode *inode)
{
return is_inode_flag_set(inode, FI_MMAP_FILE);
@@ -3292,8 +3326,6 @@ static inline bool f2fs_is_cow_file(struct inode *inode)
return is_inode_flag_set(inode, FI_COW_FILE);
}
-static inline __le32 *get_dnode_addr(struct inode *inode,
- struct page *node_page);
static inline void *inline_data_addr(struct inode *inode, struct page *page)
{
__le32 *addr = get_dnode_addr(inode, page);
@@ -3432,17 +3464,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
return F2FS_I(inode)->i_inline_xattr_size;
}
-static inline __le32 *get_dnode_addr(struct inode *inode,
- struct page *node_page)
-{
- int base = 0;
-
- if (IS_INODE(node_page) && f2fs_has_extra_attr(inode))
- base = get_extra_isize(inode);
-
- return blkaddr_in_node(F2FS_NODE(node_page)) + base;
-}
-
#define f2fs_get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -3495,7 +3516,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
- bool readonly);
+ bool readonly, bool need_lock);
int f2fs_precache_extents(struct inode *inode);
int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int f2fs_fileattr_set(struct mnt_idmap *idmap,
@@ -3719,7 +3740,7 @@ bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno);
void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src,
block_t blk_addr);
-void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
enum iostat_type io_type);
void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio);
void f2fs_outplace_write_data(struct dnode_of_data *dn,
@@ -3759,8 +3780,7 @@ void f2fs_destroy_segment_manager_caches(void);
int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint);
enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp);
-unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
- unsigned int segno);
+unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi);
unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
unsigned int segno);
@@ -3868,7 +3888,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int f2fs_encrypt_one_page(struct f2fs_io_info *fio);
bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio);
bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
-int f2fs_write_single_data_page(struct page *page, int *submitted,
+int f2fs_write_single_data_page(struct folio *folio, int *submitted,
struct bio **bio, sector_t *last_block,
struct writeback_control *wbc,
enum iostat_type io_type,
@@ -3877,7 +3897,7 @@ void f2fs_write_failed(struct inode *inode, loff_t to);
void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
bool f2fs_release_folio(struct folio *folio, gfp_t wait);
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
-void f2fs_clear_page_cache_dirty_tag(struct page *page);
+void f2fs_clear_page_cache_dirty_tag(struct folio *folio);
int f2fs_init_post_read_processing(void);
void f2fs_destroy_post_read_processing(void);
int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi);
@@ -3901,7 +3921,7 @@ void f2fs_destroy_garbage_collection_cache(void);
/* victim selection function for cleaning and SSR */
int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
int gc_type, int type, char alloc_mode,
- unsigned long long age);
+ unsigned long long age, bool one_time);
/*
* recovery.c
@@ -3987,7 +4007,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_cp_call_count(sbi, foreground) \
atomic_inc(&sbi->cp_call_count[(foreground)])
-#define stat_inc_cp_count(si) (F2FS_STAT(sbi)->cp_count++)
+#define stat_inc_cp_count(sbi) (F2FS_STAT(sbi)->cp_count++)
#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
@@ -4172,7 +4192,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
-int f2fs_write_inline_data(struct inode *inode, struct page *page);
+int f2fs_write_inline_data(struct inode *inode, struct folio *folio);
int f2fs_recover_inline_data(struct inode *inode, struct page *npage);
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
const struct f2fs_filename *fname,
@@ -4289,6 +4309,11 @@ static inline bool f2fs_meta_inode_gc_required(struct inode *inode)
* compress.c
*/
#ifdef CONFIG_F2FS_FS_COMPRESSION
+enum cluster_check_type {
+ CLUSTER_IS_COMPR, /* check only if compressed cluster */
+ CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */
+ CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */
+};
bool f2fs_is_compressed_page(struct page *page);
struct page *f2fs_compress_control_page(struct page *page);
int f2fs_prepare_compress_overwrite(struct inode *inode,
@@ -4309,12 +4334,13 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages,
int index, int nr_pages, bool uptodate);
bool f2fs_sanity_check_cluster(struct dnode_of_data *dn);
-void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
+void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio);
int f2fs_write_multi_pages(struct compress_ctx *cc,
int *submitted,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
+bool f2fs_is_sparse_cluster(struct inode *inode, pgoff_t index);
void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr,
unsigned int llen, unsigned int c_len);
@@ -4401,6 +4427,12 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
+static inline int f2fs_is_compressed_cluster(
+ struct inode *inode,
+ pgoff_t index) { return 0; }
+static inline bool f2fs_is_sparse_cluster(
+ struct inode *inode,
+ pgoff_t index) { return true; }
static inline void f2fs_update_read_extent_tree_range_compressed(
struct inode *inode,
pgoff_t fofs, block_t blkaddr,
@@ -4653,9 +4685,11 @@ static inline void f2fs_io_schedule_timeout(long timeout)
io_schedule_timeout(timeout);
}
-static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
- enum page_type type)
+static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi,
+ struct folio *folio, enum page_type type)
{
+ pgoff_t ofs = folio->index;
+
if (unlikely(f2fs_cp_error(sbi)))
return;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 168f08507004..9ae54c4c72fe 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -8,7 +8,6 @@
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/stat.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/falloc.h>
@@ -54,7 +53,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -86,7 +85,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
- int ret = f2fs_is_compressed_cluster(inode, page->index);
+ int ret = f2fs_is_compressed_cluster(inode, folio->index);
if (ret < 0) {
err = ret;
@@ -106,11 +105,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
file_update_time(vmf->vma->vm_file);
filemap_invalidate_lock_shared(inode->i_mapping);
- lock_page(page);
- if (unlikely(page->mapping != inode->i_mapping ||
- page_offset(page) > i_size_read(inode) ||
- !PageUptodate(page))) {
- unlock_page(page);
+ folio_lock(folio);
+ if (unlikely(folio->mapping != inode->i_mapping ||
+ folio_pos(folio) > i_size_read(inode) ||
+ !folio_test_uptodate(folio))) {
+ folio_unlock(folio);
err = -EFAULT;
goto out_sem;
}
@@ -118,9 +117,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_alloc) {
/* block allocation */
- err = f2fs_get_block_locked(&dn, page->index);
+ err = f2fs_get_block_locked(&dn, folio->index);
} else {
- err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE);
f2fs_put_dnode(&dn);
if (f2fs_is_pinned_file(inode) &&
!__is_valid_data_blkaddr(dn.data_blkaddr))
@@ -128,11 +127,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
}
if (err) {
- unlock_page(page);
+ folio_unlock(folio);
goto out_sem;
}
- f2fs_wait_on_page_writeback(page, DATA, false, true);
+ f2fs_wait_on_page_writeback(folio_page(folio, 0), DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
@@ -140,18 +139,18 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
/*
* check to see if the page is mapped already (no holes)
*/
- if (PageMappedToDisk(page))
+ if (folio_test_mappedtodisk(folio))
goto out_sem;
/* page is wholly or partially inside EOF */
- if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
+ if (((loff_t)(folio->index + 1) << PAGE_SHIFT) >
i_size_read(inode)) {
loff_t offset;
offset = i_size_read(inode) & ~PAGE_MASK;
- zero_user_segment(page, offset, PAGE_SIZE);
+ folio_zero_segment(folio, offset, folio_size(folio));
}
- set_page_dirty(page);
+ folio_mark_dirty(folio);
f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE);
f2fs_update_time(sbi, REQ_TIME);
@@ -163,7 +162,7 @@ out_sem:
out:
ret = vmf_fs_error(err);
- trace_f2fs_vm_page_mkwrite(inode, page->index, vmf->vma->vm_flags, ret);
+ trace_f2fs_vm_page_mkwrite(inode, folio->index, vmf->vma->vm_flags, ret);
return ret;
}
@@ -218,6 +217,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino,
TRANS_DIR_INO))
cp_reason = CP_RECOVER_DIR;
+ else if (f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino,
+ XATTR_DIR_INO))
+ cp_reason = CP_XATTR_DIR;
return cp_reason;
}
@@ -373,8 +375,7 @@ sync_nodes:
f2fs_remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(inode, FI_APPEND_WRITE);
flush_out:
- if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ||
- (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi)))
+ if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER)
ret = f2fs_issue_flush(sbi, inode->i_ino);
if (!ret) {
f2fs_remove_ino_entry(sbi, ino, UPDATE_INO);
@@ -431,7 +432,7 @@ static bool __found_offset(struct address_space *mapping,
static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode));
struct dnode_of_data dn;
pgoff_t pgofs, end_offset;
loff_t data_ofs = offset;
@@ -513,10 +514,7 @@ fail:
static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes = inode->i_sb->s_maxbytes;
-
- if (f2fs_compressed_file(inode))
- maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+ loff_t maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode));
switch (whence) {
case SEEK_SET:
@@ -1052,6 +1050,13 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
}
+ /*
+ * wait for inflight dio, blocks should be removed after
+ * IO completion.
+ */
+ if (attr->ia_size < old_size)
+ inode_dio_wait(inode);
+
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
@@ -1888,6 +1893,12 @@ static long f2fs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * wait for inflight dio, blocks should be removed after IO
+ * completion.
+ */
+ inode_dio_wait(inode);
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
goto out;
@@ -2116,10 +2127,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
struct mnt_idmap *idmap = file_mnt_idmap(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode *pinode;
loff_t isize;
int ret;
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
@@ -2149,6 +2162,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
goto out;
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[READ]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
@@ -2158,27 +2172,18 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
- if (ret) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- goto out;
- }
+ if (ret)
+ goto out_unlock;
/* Check if the inode already has a COW inode */
if (fi->cow_inode == NULL) {
/* Create a COW inode for atomic write */
- pinode = f2fs_iget(inode->i_sb, fi->i_pino);
- if (IS_ERR(pinode)) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- ret = PTR_ERR(pinode);
- goto out;
- }
+ struct dentry *dentry = file_dentry(filp);
+ struct inode *dir = d_inode(dentry->d_parent);
- ret = f2fs_get_tmpfile(idmap, pinode, &fi->cow_inode);
- iput(pinode);
- if (ret) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- goto out;
- }
+ ret = f2fs_get_tmpfile(idmap, dir, &fi->cow_inode);
+ if (ret)
+ goto out_unlock;
set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
@@ -2187,11 +2192,13 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
F2FS_I(fi->cow_inode)->atomic_inode = inode;
} else {
/* Reuse the already created COW inode */
+ f2fs_bug_on(sbi, get_dirty_pages(fi->cow_inode));
+
+ invalidate_mapping_pages(fi->cow_inode->i_mapping, 0, -1);
+
ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true);
- if (ret) {
- f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
- goto out;
- }
+ if (ret)
+ goto out_unlock;
}
f2fs_write_inode(inode, NULL);
@@ -2210,7 +2217,11 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
}
f2fs_i_size_write(fi->cow_inode, isize);
+out_unlock:
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ if (ret)
+ goto out;
f2fs_update_time(sbi, REQ_TIME);
fi->atomic_write_task = current;
@@ -2228,6 +2239,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
struct mnt_idmap *idmap = file_mnt_idmap(filp);
int ret;
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
@@ -2260,6 +2274,9 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp)
struct mnt_idmap *idmap = file_mnt_idmap(filp);
int ret;
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (!inode_owner_or_capable(idmap, inode))
return -EACCES;
@@ -2279,7 +2296,7 @@ static int f2fs_ioc_abort_atomic_write(struct file *filp)
}
int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
- bool readonly)
+ bool readonly, bool need_lock)
{
struct super_block *sb = sbi->sb;
int ret = 0;
@@ -2326,12 +2343,19 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
if (readonly)
goto out;
+ /* grab sb->s_umount to avoid racing w/ remount() */
+ if (need_lock)
+ down_read(&sbi->sb->s_umount);
+
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
f2fs_drop_discard_cmd(sbi);
clear_opt(sbi, DISCARD);
+ if (need_lock)
+ up_read(&sbi->sb->s_umount);
+
f2fs_update_time(sbi, REQ_TIME);
out:
@@ -2368,7 +2392,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
}
}
- ret = f2fs_do_shutdown(sbi, in, readonly);
+ ret = f2fs_do_shutdown(sbi, in, readonly, true);
if (need_drop)
mnt_drop_write_file(filp);
@@ -2686,7 +2710,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
(range->start + range->len) >> PAGE_SHIFT,
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) ||
+ f2fs_is_atomic_file(inode)) {
err = -EINVAL;
goto unlock_out;
}
@@ -2710,7 +2735,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
* block addresses are continuous.
*/
if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) {
- if (ei.fofs + ei.len >= pg_end)
+ if ((pgoff_t)ei.fofs + ei.len >= pg_end)
goto out;
}
@@ -2793,6 +2818,8 @@ do_map:
goto clear_out;
}
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
set_page_private_gcing(page);
f2fs_put_page(page, 1);
@@ -2917,6 +2944,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out_unlock;
}
+ if (f2fs_is_atomic_file(src) || f2fs_is_atomic_file(dst)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
ret = -EINVAL;
if (pos_in + len > src->i_size || pos_in + len < pos_in)
goto out_unlock;
@@ -2968,9 +3000,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
}
f2fs_lock_op(sbi);
- ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS,
- pos_out >> F2FS_BLKSIZE_BITS,
- len >> F2FS_BLKSIZE_BITS, false);
+ ret = __exchange_data_block(src, dst, F2FS_BYTES_TO_BLK(pos_in),
+ F2FS_BYTES_TO_BLK(pos_out),
+ F2FS_BYTES_TO_BLK(len), false);
if (!ret) {
if (dst_max_i_size)
@@ -3014,10 +3046,10 @@ static int __f2fs_ioc_move_range(struct file *filp,
return -EBADF;
dst = fdget(range->dst_fd);
- if (!dst.file)
+ if (!fd_file(dst))
return -EBADF;
- if (!(dst.file->f_mode & FMODE_WRITE)) {
+ if (!(fd_file(dst)->f_mode & FMODE_WRITE)) {
err = -EBADF;
goto err_out;
}
@@ -3026,7 +3058,7 @@ static int __f2fs_ioc_move_range(struct file *filp,
if (err)
goto err_out;
- err = f2fs_move_file_range(filp, range->pos_in, dst.file,
+ err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst),
range->pos_out, range->len);
mnt_drop_write_file(filp);
@@ -3300,6 +3332,11 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
inode_lock(inode);
+ if (f2fs_is_atomic_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (!pin) {
clear_inode_flag(inode, FI_PIN_FILE);
f2fs_i_gc_failures_write(inode, 0);
@@ -4193,6 +4230,8 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
/* It will never fail, when page has pinned above */
f2fs_bug_on(F2FS_I_SB(inode), !page);
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
set_page_dirty(page);
set_page_private_gcing(page);
f2fs_put_page(page, 1);
@@ -4207,9 +4246,8 @@ static int f2fs_ioc_decompress_file(struct file *filp)
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- pgoff_t page_idx = 0, last_idx;
- int cluster_size = fi->i_cluster_size;
- int count, ret;
+ pgoff_t page_idx = 0, last_idx, cluster_idx;
+ int ret;
if (!f2fs_sb_has_compression(sbi) ||
F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER)
@@ -4244,10 +4282,15 @@ static int f2fs_ioc_decompress_file(struct file *filp)
goto out;
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ last_idx >>= fi->i_log_cluster_size;
- count = last_idx - page_idx;
- while (count && count >= cluster_size) {
- ret = redirty_blocks(inode, page_idx, cluster_size);
+ for (cluster_idx = 0; cluster_idx < last_idx; cluster_idx++) {
+ page_idx = cluster_idx << fi->i_log_cluster_size;
+
+ if (!f2fs_is_compressed_cluster(inode, page_idx))
+ continue;
+
+ ret = redirty_blocks(inode, page_idx, fi->i_cluster_size);
if (ret < 0)
break;
@@ -4257,9 +4300,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
break;
}
- count -= cluster_size;
- page_idx += cluster_size;
-
cond_resched();
if (fatal_signal_pending(current)) {
ret = -EINTR;
@@ -4286,9 +4326,9 @@ static int f2fs_ioc_compress_file(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- pgoff_t page_idx = 0, last_idx;
- int cluster_size = F2FS_I(inode)->i_cluster_size;
- int count, ret;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ pgoff_t page_idx = 0, last_idx, cluster_idx;
+ int ret;
if (!f2fs_sb_has_compression(sbi) ||
F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER)
@@ -4322,10 +4362,15 @@ static int f2fs_ioc_compress_file(struct file *filp)
set_inode_flag(inode, FI_ENABLE_COMPRESS);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ last_idx >>= fi->i_log_cluster_size;
+
+ for (cluster_idx = 0; cluster_idx < last_idx; cluster_idx++) {
+ page_idx = cluster_idx << fi->i_log_cluster_size;
+
+ if (f2fs_is_sparse_cluster(inode, page_idx))
+ continue;
- count = last_idx - page_idx;
- while (count && count >= cluster_size) {
- ret = redirty_blocks(inode, page_idx, cluster_size);
+ ret = redirty_blocks(inode, page_idx, fi->i_cluster_size);
if (ret < 0)
break;
@@ -4335,9 +4380,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
break;
}
- count -= cluster_size;
- page_idx += cluster_size;
-
cond_resched();
if (fatal_signal_pending(current)) {
ret = -EINTR;
@@ -4538,6 +4580,13 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
+ /* dio is not compatible w/ atomic file */
+ if (f2fs_is_atomic_file(inode)) {
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
/*
* We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
* the higher-level function iomap_dio_rw() in order to ensure that the
@@ -4597,6 +4646,10 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos,
iov_iter_count(to), READ);
+ /* In LFS mode, if there is inflight dio, wait for its completion */
+ if (f2fs_lfs_mode(F2FS_I_SB(inode)))
+ inode_dio_wait(inode);
+
if (f2fs_should_use_dio(inode, iocb, to)) {
ret = f2fs_dio_read_iter(iocb, to);
} else {
@@ -4949,6 +5002,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Determine whether we will do a direct write or a buffered write. */
dio = f2fs_should_use_dio(inode, iocb, from);
+ /* dio is not compatible w/ atomic write */
+ if (dio && f2fs_is_atomic_file(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
/* Possibly preallocate the blocks for the write. */
target_size = iocb->ki_pos + iov_iter_count(from);
preallocated = f2fs_preallocate_blocks(iocb, from, dio);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 724bbcb447d3..9322a7200e31 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -81,6 +81,8 @@ static int gc_thread_func(void *data)
continue;
}
+ gc_control.one_time = false;
+
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
@@ -116,15 +118,30 @@ static int gc_thread_func(void *data)
goto next;
}
- if (has_enough_invalid_blocks(sbi))
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ if (has_enough_free_blocks(sbi,
+ gc_th->no_zoned_gc_percent)) {
+ wait_ms = gc_th->no_gc_sleep_time;
+ f2fs_up_write(&sbi->gc_lock);
+ goto next;
+ }
+ if (wait_ms == gc_th->no_gc_sleep_time)
+ wait_ms = gc_th->max_sleep_time;
+ }
+
+ if (need_to_boost_gc(sbi)) {
decrease_sleep_time(gc_th, &wait_ms);
- else
+ if (f2fs_sb_has_blkzoned(sbi))
+ gc_control.one_time = true;
+ } else {
increase_sleep_time(gc_th, &wait_ms);
+ }
do_gc:
stat_inc_gc_call_count(sbi, foreground ?
FOREGROUND : BACKGROUND);
- sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+ sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
+ gc_control.one_time;
/* foreground GC was been triggered via f2fs_balance_fs() */
if (foreground)
@@ -179,9 +196,21 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
return -ENOMEM;
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
- gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
- gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
- gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
+ gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO;
+
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED;
+ gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED;
+ gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED;
+ gc_th->no_zoned_gc_percent = LIMIT_NO_ZONED_GC;
+ gc_th->boost_zoned_gc_percent = LIMIT_BOOST_ZONED_GC;
+ } else {
+ gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
+ gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
+ gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
+ gc_th->no_zoned_gc_percent = 0;
+ gc_th->boost_zoned_gc_percent = 0;
+ }
gc_th->gc_wake = false;
@@ -339,7 +368,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned char age = 0;
unsigned char u;
unsigned int i;
- unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno);
+ unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi);
for (i = 0; i < usable_segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
@@ -368,6 +397,11 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >=
+ CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio /
+ 100))
+ return UINT_MAX;
+
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
return get_valid_blocks(sbi, segno, true);
@@ -742,7 +776,7 @@ static int f2fs_gc_pinned_control(struct inode *inode, int gc_type,
*/
int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
int gc_type, int type, char alloc_mode,
- unsigned long long age)
+ unsigned long long age, bool one_time)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct sit_info *sm = SIT_I(sbi);
@@ -759,6 +793,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
p.alloc_mode = alloc_mode;
p.age = age;
p.age_threshold = sbi->am.age_threshold;
+ p.one_time_gc = one_time;
retry:
select_policy(sbi, gc_type, type, &p);
@@ -1670,13 +1705,14 @@ next_step:
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
- int gc_type)
+ int gc_type, bool one_time)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
down_write(&sit_i->sentry_lock);
- ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS, 0);
+ ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE,
+ LFS, 0, one_time);
up_write(&sit_i->sentry_lock);
return ret;
}
@@ -1684,30 +1720,49 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
static int do_garbage_collect(struct f2fs_sb_info *sbi,
unsigned int start_segno,
struct gc_inode_list *gc_list, int gc_type,
- bool force_migrate)
+ bool force_migrate, bool one_time)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
+ unsigned int sec_end_segno;
int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE;
int submitted = 0;
- if (__is_large_section(sbi))
- end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
+ if (__is_large_section(sbi)) {
+ sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
- /*
- * zone-capacity can be less than zone-size in zoned devices,
- * resulting in less than expected usable segments in the zone,
- * calculate the end segno in the zone which can be garbage collected
- */
- if (f2fs_sb_has_blkzoned(sbi))
- end_segno -= SEGS_PER_SEC(sbi) -
- f2fs_usable_segs_in_sec(sbi, segno);
+ /*
+ * zone-capacity can be less than zone-size in zoned devices,
+ * resulting in less than expected usable segments in the zone,
+ * calculate the end segno in the zone which can be garbage
+ * collected
+ */
+ if (f2fs_sb_has_blkzoned(sbi))
+ sec_end_segno -= SEGS_PER_SEC(sbi) -
+ f2fs_usable_segs_in_sec(sbi);
+
+ if (gc_type == BG_GC || one_time) {
+ unsigned int window_granularity =
+ sbi->migration_window_granularity;
+
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ !has_enough_free_blocks(sbi,
+ sbi->gc_thread->boost_zoned_gc_percent))
+ window_granularity *=
+ BOOST_GC_MULTIPLE;
+
+ end_segno = start_segno + window_granularity;
+ }
+
+ if (end_segno > sec_end_segno)
+ end_segno = sec_end_segno;
+ }
sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
@@ -1786,7 +1841,8 @@ freed:
if (__is_large_section(sbi))
sbi->next_victim_seg[gc_type] =
- (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO;
+ (segno + 1 < sec_end_segno) ?
+ segno + 1 : NULL_SEGNO;
skip:
f2fs_put_page(sum_page, 0);
}
@@ -1863,7 +1919,7 @@ gc_more:
goto stop;
}
retry:
- ret = __get_victim(sbi, &segno, gc_type);
+ ret = __get_victim(sbi, &segno, gc_type, gc_control->one_time);
if (ret) {
/* allow to search victim from sections has pinned data */
if (ret == -ENODATA && gc_type == FG_GC &&
@@ -1875,17 +1931,21 @@ retry:
}
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
- gc_control->should_migrate_blocks);
+ gc_control->should_migrate_blocks,
+ gc_control->one_time);
if (seg_freed < 0)
goto stop;
total_freed += seg_freed;
- if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) {
+ if (seg_freed == f2fs_usable_segs_in_sec(sbi)) {
sec_freed++;
total_sec_freed++;
}
+ if (gc_control->one_time)
+ goto stop;
+
if (gc_type == FG_GC) {
sbi->cur_victim_sec = NULL_SEGNO;
@@ -2010,8 +2070,7 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- do_garbage_collect(sbi, segno, &gc_list, FG_GC,
- dry_run_sections == 0);
+ do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false);
put_gc_inode(&gc_list);
if (!dry_run && get_valid_blocks(sbi, segno, true))
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index a8ea3301b815..2914b678bf8f 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -15,16 +15,27 @@
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
+/* GC sleep parameters for zoned deivces */
+#define DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED 10
+#define DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED 20
+#define DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED 60000
+
/* choose candidates from sections which has age of more than 7 days */
#define DEF_GC_THREAD_AGE_THRESHOLD (60 * 60 * 24 * 7)
#define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */
#define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */
#define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */
+#define DEF_GC_THREAD_VALID_THRESH_RATIO 95 /* do not GC over 95% valid block ratio for one time GC */
#define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
+#define LIMIT_NO_ZONED_GC 60 /* percentage over total user space of no gc for zoned devices */
+#define LIMIT_BOOST_ZONED_GC 25 /* percentage over total user space of boosted gc for zoned devices */
+#define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3
+#define BOOST_GC_MULTIPLE 5
+
#define DEF_GC_FAILED_PINNED_FILES 2048
#define MAX_GC_FAILED_PINNED_FILES USHRT_MAX
@@ -51,6 +62,11 @@ struct f2fs_gc_kthread {
* caller of f2fs_balance_fs()
* will wait on this wait queue.
*/
+
+ /* for gc control for zoned devices */
+ unsigned int no_zoned_gc_percent;
+ unsigned int boost_zoned_gc_percent;
+ unsigned int valid_thresh_ratio;
};
struct gc_inode_list {
@@ -152,6 +168,12 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
*wait -= min_time;
}
+static inline bool has_enough_free_blocks(struct f2fs_sb_info *sbi,
+ unsigned int limit_perc)
+{
+ return free_sections(sbi) > ((sbi->total_sections * limit_perc) / 100);
+}
+
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
{
block_t user_block_count = sbi->user_block_count;
@@ -167,3 +189,10 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
free_user_blocks(sbi) <
limit_free_user_blocks(invalid_user_blocks));
}
+
+static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi)
+{
+ if (f2fs_sb_has_blkzoned(sbi))
+ return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC);
+ return has_enough_invalid_blocks(sbi);
+}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index cca7d448e55c..005babf1bed1 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -260,35 +260,34 @@ out:
return err;
}
-int f2fs_write_inline_data(struct inode *inode, struct page *page)
+int f2fs_write_inline_data(struct inode *inode, struct folio *folio)
{
- struct dnode_of_data dn;
- int err;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct page *ipage;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE);
- if (err)
- return err;
+ ipage = f2fs_get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
if (!f2fs_has_inline_data(inode)) {
- f2fs_put_dnode(&dn);
+ f2fs_put_page(ipage, 1);
return -EAGAIN;
}
- f2fs_bug_on(F2FS_I_SB(inode), page->index);
+ f2fs_bug_on(F2FS_I_SB(inode), folio->index);
- f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
- memcpy_from_page(inline_data_addr(inode, dn.inode_page),
- page, 0, MAX_INLINE_DATA(inode));
- set_page_dirty(dn.inode_page);
+ f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ memcpy_from_folio(inline_data_addr(inode, ipage),
+ folio, 0, MAX_INLINE_DATA(inode));
+ set_page_dirty(ipage);
- f2fs_clear_page_cache_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(folio);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
- clear_page_private_inline(dn.inode_page);
- f2fs_put_dnode(&dn);
+ clear_page_private_inline(ipage);
+ f2fs_put_page(ipage, 1);
return 0;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index aef57172014f..1ed86df343a5 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -7,7 +7,6 @@
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include <linux/lz4.h>
@@ -35,6 +34,11 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
if (f2fs_inode_dirtied(inode, sync))
return;
+ if (f2fs_is_atomic_file(inode)) {
+ set_inode_flag(inode, FI_ATOMIC_DIRTIED);
+ return;
+ }
+
mark_inode_dirty_sync(inode);
}
@@ -175,7 +179,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
if (provided != calculated)
f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x",
- page->index, ino_of_node(page), provided, calculated);
+ page_folio(page)->index, ino_of_node(page),
+ provided, calculated);
return provided == calculated;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 38b4750475db..57d46e1439de 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -457,62 +457,6 @@ struct dentry *f2fs_get_parent(struct dentry *child)
return d_obtain_alias(f2fs_iget(child->d_sb, ino));
}
-static int __recover_dot_dentries(struct inode *dir, nid_t pino)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct qstr dot = QSTR_INIT(".", 1);
- struct f2fs_dir_entry *de;
- struct page *page;
- int err = 0;
-
- if (f2fs_readonly(sbi->sb)) {
- f2fs_info(sbi, "skip recovering inline_dots inode (ino:%lu, pino:%u) in readonly mountpoint",
- dir->i_ino, pino);
- return 0;
- }
-
- if (!S_ISDIR(dir->i_mode)) {
- f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)",
- dir->i_ino, dir->i_mode, pino);
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- return -ENOTDIR;
- }
-
- err = f2fs_dquot_initialize(dir);
- if (err)
- return err;
-
- f2fs_balance_fs(sbi, true);
-
- f2fs_lock_op(sbi);
-
- de = f2fs_find_entry(dir, &dot, &page);
- if (de) {
- f2fs_put_page(page, 0);
- } else if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto out;
- } else {
- err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
- if (err)
- goto out;
- }
-
- de = f2fs_find_entry(dir, &dotdot_name, &page);
- if (de)
- f2fs_put_page(page, 0);
- else if (IS_ERR(page))
- err = PTR_ERR(page);
- else
- err = f2fs_do_add_link(dir, &dotdot_name, NULL, pino, S_IFDIR);
-out:
- if (!err)
- clear_inode_flag(dir, FI_INLINE_DOTS);
-
- f2fs_unlock_op(sbi);
- return err;
-}
-
static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -522,7 +466,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *new;
nid_t ino = -1;
int err = 0;
- unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
struct f2fs_filename fname;
trace_f2fs_lookup_start(dir, dentry, flags);
@@ -558,17 +501,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
- err = __recover_dot_dentries(dir, root_ino);
- if (err)
- goto out_iput;
- }
-
- if (f2fs_has_inline_dots(inode)) {
- err = __recover_dot_dentries(inode, dir->i_ino);
- if (err)
- goto out_iput;
- }
if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b72ef96f7e33..59b13ff243fa 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -20,7 +20,7 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
-#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
+#define on_f2fs_build_free_nids(nm_i) mutex_is_locked(&(nm_i)->build_lock)
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
@@ -123,7 +123,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
static void clear_node_page_dirty(struct page *page)
{
if (PageDirty(page)) {
- f2fs_clear_page_cache_dirty_tag(page);
+ f2fs_clear_page_cache_dirty_tag(page_folio(page));
clear_page_dirty_for_io(page);
dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
}
@@ -919,7 +919,7 @@ static int truncate_node(struct dnode_of_data *dn)
clear_node_page_dirty(dn->node_page);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- index = dn->node_page->index;
+ index = page_folio(dn->node_page)->index;
f2fs_put_page(dn->node_page, 1);
invalidate_mapping_pages(NODE_MAPPING(sbi),
@@ -1369,6 +1369,7 @@ fail:
*/
static int read_node_page(struct page *page, blk_opf_t op_flags)
{
+ struct folio *folio = page_folio(page);
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
struct f2fs_io_info fio = {
@@ -1381,21 +1382,21 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
};
int err;
- if (PageUptodate(page)) {
+ if (folio_test_uptodate(folio)) {
if (!f2fs_inode_chksum_verify(sbi, page)) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
return -EFSBADCRC;
}
return LOCKED_PAGE;
}
- err = f2fs_get_node_info(sbi, page->index, &ni, false);
+ err = f2fs_get_node_info(sbi, folio->index, &ni, false);
if (err)
return err;
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
return -ENOENT;
}
@@ -1492,7 +1493,7 @@ out_err:
out_put_err:
/* ENOENT comes from read_node_page which is not an error. */
if (err != -ENOENT)
- f2fs_handle_page_eio(sbi, page->index, NODE);
+ f2fs_handle_page_eio(sbi, page_folio(page), NODE);
f2fs_put_page(page, 1);
return ERR_PTR(err);
}
@@ -1535,7 +1536,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
if (!clear_page_dirty_for_io(page))
goto page_out;
- ret = f2fs_write_inline_data(inode, page);
+ ret = f2fs_write_inline_data(inode, page_folio(page));
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
if (ret)
@@ -1608,6 +1609,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
enum iostat_type io_type, unsigned int *seq_id)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct folio *folio = page_folio(page);
nid_t nid;
struct node_info ni;
struct f2fs_io_info fio = {
@@ -1624,15 +1626,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
};
unsigned int seq;
- trace_f2fs_writepage(page_folio(page), NODE);
+ trace_f2fs_writepage(folio, NODE);
if (unlikely(f2fs_cp_error(sbi))) {
/* keep node pages in remount-ro mode */
if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY)
goto redirty_out;
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -1646,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
/* get old block addr of this node page */
nid = nid_of_node(page);
- f2fs_bug_on(sbi, page->index != nid);
+ f2fs_bug_on(sbi, folio->index != nid);
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
goto redirty_out;
@@ -1660,10 +1662,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_NODES);
f2fs_up_read(&sbi->node_write);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
}
@@ -1674,7 +1676,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
}
- if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi))
+ if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
/* should add to global list before clearing PAGECACHE status */
@@ -1684,7 +1686,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
*seq_id = seq;
}
- set_page_writeback(page);
+ folio_start_writeback(folio);
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
@@ -1697,7 +1699,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
submitted = NULL;
}
- unlock_page(page);
+ folio_unlock(folio);
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_write(sbi, NODE);
@@ -1711,7 +1713,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
return 0;
redirty_out:
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return AOP_WRITEPAGE_ACTIVATE;
}
@@ -1867,7 +1869,7 @@ continue_unlock:
}
if (!ret && atomic && !marked) {
f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
- ino, last_page->index);
+ ino, page_folio(last_page)->index);
lock_page(last_page);
f2fs_wait_on_page_writeback(last_page, NODE, true, true);
set_page_dirty(last_page);
@@ -3166,7 +3168,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
- nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
+ F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
@@ -3185,7 +3187,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
if (IS_ERR(page))
return PTR_ERR(page);
- memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
+ memcpy(nm_i->nat_bits + F2FS_BLK_TO_BYTES(i),
page_address(page), F2FS_BLKSIZE);
f2fs_put_page(page, 1);
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 78c3198a6308..1766254279d2 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -199,6 +199,10 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
clear_inode_flag(inode, FI_ATOMIC_REPLACE);
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) {
+ clear_inode_flag(inode, FI_ATOMIC_DIRTIED);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
stat_dec_atomic_inode(inode);
F2FS_I(inode)->atomic_write_task = NULL;
@@ -366,6 +370,10 @@ out:
} else {
sbi->committed_atomic_block += fi->atomic_write_cnt;
set_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) {
+ clear_inode_flag(inode, FI_ATOMIC_DIRTIED);
+ f2fs_mark_inode_dirty_sync(inode, true);
+ }
}
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
@@ -1282,6 +1290,13 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
wait_list, issued);
return 0;
}
+
+ /*
+ * Issue discard for conventional zones only if the device
+ * supports discard.
+ */
+ if (!bdev_max_discard_sectors(bdev))
+ return -EOPNOTSUPP;
}
#endif
@@ -2686,22 +2701,47 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
goto got_it;
}
+#ifdef CONFIG_BLK_DEV_ZONED
/*
* If we format f2fs on zoned storage, let's try to get pinned sections
* from beginning of the storage, which should be a conventional one.
*/
if (f2fs_sb_has_blkzoned(sbi)) {
- segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg);
+ /* Prioritize writing to conventional zones */
+ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning)
+ segno = 0;
+ else
+ segno = max(first_zoned_segno(sbi), *newseg);
hint = GET_SEC_FROM_SEG(sbi, segno);
}
+#endif
find_other_zone:
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) {
+ /* Write only to sequential zones */
+ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) {
+ hint = GET_SEC_FROM_SEG(sbi, first_zoned_segno(sbi));
+ secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
+ } else
+ secno = find_first_zero_bit(free_i->free_secmap,
+ MAIN_SECS(sbi));
+ if (secno >= MAIN_SECS(sbi)) {
+ ret = -ENOSPC;
+ f2fs_bug_on(sbi, 1);
+ goto out_unlock;
+ }
+ }
+#endif
+
if (secno >= MAIN_SECS(sbi)) {
secno = find_first_zero_bit(free_i->free_secmap,
MAIN_SECS(sbi));
if (secno >= MAIN_SECS(sbi)) {
ret = -ENOSPC;
+ f2fs_bug_on(sbi, 1);
goto out_unlock;
}
}
@@ -2743,10 +2783,8 @@ got_it:
out_unlock:
spin_unlock(&free_i->segmap_lock);
- if (ret == -ENOSPC) {
+ if (ret == -ENOSPC)
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
- f2fs_bug_on(sbi, 1);
- }
return ret;
}
@@ -3052,7 +3090,8 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
sanity_check_seg_type(sbi, seg_type);
/* f2fs_need_SSR() already forces to do this */
- if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) {
+ if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type,
+ alloc_mode, age, false)) {
curseg->next_segno = segno;
return 1;
}
@@ -3079,7 +3118,8 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
for (; cnt-- > 0; reversed ? i-- : i++) {
if (i == seg_type)
continue;
- if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) {
+ if (!f2fs_get_victim(sbi, &segno, BG_GC, i,
+ alloc_mode, age, false)) {
curseg->next_segno = segno;
return 1;
}
@@ -3522,7 +3562,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
- type = __get_age_segment_type(inode, fio->page->index);
+ type = __get_age_segment_type(inode,
+ page_folio(fio->page)->index);
if (type != NO_CHECK_TYPE)
return type;
@@ -3781,7 +3822,7 @@ out:
f2fs_up_read(&fio->sbi->io_order_lock);
}
-void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
enum iostat_type io_type)
{
struct f2fs_io_info fio = {
@@ -3790,20 +3831,20 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
.temp = HOT,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
- .old_blkaddr = page->index,
- .new_blkaddr = page->index,
- .page = page,
+ .old_blkaddr = folio->index,
+ .new_blkaddr = folio->index,
+ .page = folio_page(folio, 0),
.encrypted_page = NULL,
.in_list = 0,
};
- if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+ if (unlikely(folio->index >= MAIN_BLKADDR(sbi)))
fio.op_flags &= ~REQ_META;
- set_page_writeback(page);
+ folio_start_writeback(folio);
f2fs_submit_page_write(&fio);
- stat_inc_meta_count(sbi, page->index);
+ stat_inc_meta_count(sbi, folio->index);
f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE);
}
@@ -5381,8 +5422,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
return BLKS_PER_SEG(sbi);
}
-unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
- unsigned int segno)
+unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi)
{
if (f2fs_sb_has_blkzoned(sbi))
return CAP_SEGS_PER_SEC(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index bfc01a521cb9..71adb4a43bec 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -188,6 +188,7 @@ struct victim_sel_policy {
unsigned int min_segno; /* segment # having min. cost */
unsigned long long age; /* mtime of GCed section*/
unsigned long long age_threshold;/* age threshold */
+ bool one_time_gc; /* one time GC */
};
struct seg_entry {
@@ -430,7 +431,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
+ unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi);
spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
@@ -464,7 +465,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno);
+ unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi);
spin_lock(&free_i->segmap_lock);
if (test_and_clear_bit(segno, free_i->free_segmap)) {
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 176b5177c89d..87ab5696bd48 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -11,7 +11,6 @@
#include <linux/fs_context.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
-#include <linux/buffer_head.h>
#include <linux/kthread.h>
#include <linux/parser.h>
#include <linux/mount.h>
@@ -707,6 +706,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!strcmp(name, "on")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
} else if (!strcmp(name, "off")) {
+ if (f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_warn(sbi, "zoned devices need bggc");
+ kfree(name);
+ return -EINVAL;
+ }
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
} else if (!strcmp(name, "sync")) {
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
@@ -2561,7 +2565,7 @@ restore_opts:
static void f2fs_shutdown(struct super_block *sb)
{
- f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false);
+ f2fs_do_shutdown(F2FS_SB(sb), F2FS_GOING_DOWN_NOSYNC, false, false);
}
#ifdef CONFIG_QUOTA
@@ -3318,29 +3322,47 @@ loff_t max_file_blocks(struct inode *inode)
* fit within U32_MAX + 1 data units.
*/
- result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS);
+ result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096));
return result;
}
-static int __f2fs_commit_super(struct buffer_head *bh,
- struct f2fs_super_block *super)
+static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio,
+ pgoff_t index, bool update)
{
- lock_buffer(bh);
- if (super)
- memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
- set_buffer_dirty(bh);
- unlock_buffer(bh);
-
+ struct bio *bio;
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
+ blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA;
+ int ret;
+
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ if (update)
+ memcpy(F2FS_SUPER_BLOCK(folio, index), F2FS_RAW_SUPER(sbi),
+ sizeof(struct f2fs_super_block));
+ folio_mark_dirty(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+
+ bio = bio_alloc(sbi->sb->s_bdev, 1, opf, GFP_NOFS);
+
+ /* it doesn't need to set crypto context for superblock update */
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio_index(folio));
+
+ if (!bio_add_folio(bio, folio, folio_size(folio), 0))
+ f2fs_bug_on(sbi, 1);
+
+ ret = submit_bio_wait(bio);
+ folio_end_writeback(folio);
+
+ return ret;
}
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
- struct buffer_head *bh)
+ struct folio *folio, pgoff_t index)
{
- struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
- (bh->b_data + F2FS_SUPER_OFFSET);
+ struct f2fs_super_block *raw_super = F2FS_SUPER_BLOCK(folio, index);
struct super_block *sb = sbi->sb;
u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
@@ -3356,9 +3378,9 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
u32 segment_count = le32_to_cpu(raw_super->segment_count);
u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
u64 main_end_blkaddr = main_blkaddr +
- (segment_count_main << log_blocks_per_seg);
+ ((u64)segment_count_main << log_blocks_per_seg);
u64 seg_end_blkaddr = segment0_blkaddr +
- (segment_count << log_blocks_per_seg);
+ ((u64)segment_count << log_blocks_per_seg);
if (segment0_blkaddr != cp_blkaddr) {
f2fs_info(sbi, "Mismatch start address, segment0(%u) cp_blkaddr(%u)",
@@ -3415,7 +3437,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
set_sbi_flag(sbi, SBI_NEED_SB_WRITE);
res = "internally";
} else {
- err = __f2fs_commit_super(bh, NULL);
+ err = __f2fs_commit_super(sbi, folio, index, false);
res = err ? "failed" : "done";
}
f2fs_info(sbi, "Fix alignment : %s, start(%u) end(%llu) block(%u)",
@@ -3428,12 +3450,11 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
}
static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
- struct buffer_head *bh)
+ struct folio *folio, pgoff_t index)
{
block_t segment_count, segs_per_sec, secs_per_zone, segment_count_main;
block_t total_sections, blocks_per_seg;
- struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
- (bh->b_data + F2FS_SUPER_OFFSET);
+ struct f2fs_super_block *raw_super = F2FS_SUPER_BLOCK(folio, index);
size_t crc_offset = 0;
__u32 crc = 0;
@@ -3591,7 +3612,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
- if (sanity_check_area_boundary(sbi, bh))
+ if (sanity_check_area_boundary(sbi, folio, index))
return -EFSCORRUPTED;
return 0;
@@ -3786,6 +3807,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
sbi->migration_granularity = SEGS_PER_SEC(sbi);
+ sbi->migration_window_granularity = f2fs_sb_has_blkzoned(sbi) ?
+ DEF_MIGRATION_WINDOW_GRANULARITY_ZONED : SEGS_PER_SEC(sbi);
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
@@ -3938,7 +3961,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
{
struct super_block *sb = sbi->sb;
int block;
- struct buffer_head *bh;
+ struct folio *folio;
struct f2fs_super_block *super;
int err = 0;
@@ -3947,32 +3970,32 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
return -ENOMEM;
for (block = 0; block < 2; block++) {
- bh = sb_bread(sb, block);
- if (!bh) {
+ folio = read_mapping_folio(sb->s_bdev->bd_mapping, block, NULL);
+ if (IS_ERR(folio)) {
f2fs_err(sbi, "Unable to read %dth superblock",
block + 1);
- err = -EIO;
+ err = PTR_ERR(folio);
*recovery = 1;
continue;
}
/* sanity checking of raw super */
- err = sanity_check_raw_super(sbi, bh);
+ err = sanity_check_raw_super(sbi, folio, block);
if (err) {
f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock",
block + 1);
- brelse(bh);
+ folio_put(folio);
*recovery = 1;
continue;
}
if (!*raw_super) {
- memcpy(super, bh->b_data + F2FS_SUPER_OFFSET,
+ memcpy(super, F2FS_SUPER_BLOCK(folio, block),
sizeof(*super));
*valid_super_block = block;
*raw_super = super;
}
- brelse(bh);
+ folio_put(folio);
}
/* No valid superblock */
@@ -3986,7 +4009,8 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
- struct buffer_head *bh;
+ struct folio *folio;
+ pgoff_t index;
__u32 crc = 0;
int err;
@@ -4004,22 +4028,24 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
}
/* write back-up superblock first */
- bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1);
- if (!bh)
- return -EIO;
- err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
- brelse(bh);
+ index = sbi->valid_super_block ? 0 : 1;
+ folio = read_mapping_folio(sbi->sb->s_bdev->bd_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ err = __f2fs_commit_super(sbi, folio, index, true);
+ folio_put(folio);
/* if we are in recovery path, skip writing valid superblock */
if (recover || err)
return err;
/* write current valid superblock */
- bh = sb_bread(sbi->sb, sbi->valid_super_block);
- if (!bh)
- return -EIO;
- err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
- brelse(bh);
+ index = sbi->valid_super_block;
+ folio = read_mapping_folio(sbi->sb->s_bdev->bd_mapping, index, NULL);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ err = __f2fs_commit_super(sbi, folio, index, true);
+ folio_put(folio);
return err;
}
@@ -4173,12 +4199,14 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason,
}
f2fs_warn(sbi, "Remounting filesystem read-only");
+
/*
- * Make sure updated value of ->s_mount_flags will be visible before
- * ->s_flags update
+ * We have already set CP_ERROR_FLAG flag to stop all updates
+ * to filesystem, so it doesn't need to set SB_RDONLY flag here
+ * because the flag should be set covered w/ sb->s_umount semaphore
+ * via remount procedure, otherwise, it will confuse code like
+ * freeze_super() which will lead to deadlocks and other problems.
*/
- smp_wmb();
- sb->s_flags |= SB_RDONLY;
}
static void f2fs_record_error_work(struct work_struct *work)
@@ -4219,6 +4247,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->aligned_blksize = true;
#ifdef CONFIG_BLK_DEV_ZONED
sbi->max_open_zones = UINT_MAX;
+ sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ;
#endif
for (i = 0; i < max_devices; i++) {
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index fee7ee45ceaa..c56e8c873935 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -170,6 +170,12 @@ static ssize_t undiscard_blks_show(struct f2fs_attr *a,
SM_I(sbi)->dcc_info->undiscard_blks);
}
+static ssize_t atgc_enabled_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", sbi->am.atgc_enabled ? 1 : 0);
+}
+
static ssize_t gc_mode_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -182,50 +188,50 @@ static ssize_t features_show(struct f2fs_attr *a,
int len = 0;
if (f2fs_sb_has_encrypt(sbi))
- len += scnprintf(buf, PAGE_SIZE - len, "%s",
+ len += sysfs_emit_at(buf, len, "%s",
"encryption");
if (f2fs_sb_has_blkzoned(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "extra_attr");
if (f2fs_sb_has_project_quota(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "projquota");
if (f2fs_sb_has_inode_chksum(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "inode_checksum");
if (f2fs_sb_has_flexible_inline_xattr(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "flexible_inline_xattr");
if (f2fs_sb_has_quota_ino(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "quota_ino");
if (f2fs_sb_has_inode_crtime(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "inode_crtime");
if (f2fs_sb_has_lost_found(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "lost_found");
if (f2fs_sb_has_verity(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "sb_checksum");
if (f2fs_sb_has_casefold(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "casefold");
if (f2fs_sb_has_readonly(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "readonly");
if (f2fs_sb_has_compression(sbi))
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "compression");
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += sysfs_emit_at(buf, len, "%s%s",
len ? ", " : "", "pin_file");
- len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
return len;
}
@@ -323,17 +329,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
int hot_count = sbi->raw_super->hot_ext_count;
int len = 0, i;
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "cold file extension:\n");
+ len += sysfs_emit_at(buf, len, "cold file extension:\n");
for (i = 0; i < cold_count; i++)
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
- extlist[i]);
+ len += sysfs_emit_at(buf, len, "%s\n", extlist[i]);
- len += scnprintf(buf + len, PAGE_SIZE - len,
- "hot file extension:\n");
+ len += sysfs_emit_at(buf, len, "hot file extension:\n");
for (i = cold_count; i < cold_count + hot_count; i++)
- len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
- extlist[i]);
+ len += sysfs_emit_at(buf, len, "%s\n", extlist[i]);
+
return len;
}
@@ -561,6 +564,11 @@ out:
return -EINVAL;
}
+ if (!strcmp(a->attr.name, "migration_window_granularity")) {
+ if (t == 0 || t > SEGS_PER_SEC(sbi))
+ return -EINVAL;
+ }
+
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
@@ -627,6 +635,15 @@ out:
}
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (!strcmp(a->attr.name, "blkzone_alloc_policy")) {
+ if (t < BLKZONE_ALLOC_PRIOR_SEQ || t > BLKZONE_ALLOC_PRIOR_CONV)
+ return -EINVAL;
+ sbi->blkzone_alloc_policy = t;
+ return count;
+ }
+#endif
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block") ||
!strcmp(a->attr.name, "compr_saved_block")) {
@@ -775,7 +792,8 @@ out:
if (!strcmp(a->attr.name, "ipu_policy")) {
if (t >= BIT(F2FS_IPU_MAX))
return -EINVAL;
- if (t && f2fs_lfs_mode(sbi))
+ /* allow F2FS_IPU_NOCACHE only for IPU in the pinned file */
+ if (f2fs_lfs_mode(sbi) && (t & ~BIT(F2FS_IPU_NOCACHE)))
return -EINVAL;
SM_I(sbi)->ipu_policy = (unsigned int)t;
return count;
@@ -960,6 +978,9 @@ GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time);
GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time);
GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time);
GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
+GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent);
+GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent);
+GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio);
/* SM_INFO ATTR */
SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments);
@@ -969,6 +990,7 @@ SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks);
SM_INFO_GENERAL_RW_ATTR(min_seq_blocks);
SM_INFO_GENERAL_RW_ATTR(min_hot_blocks);
SM_INFO_GENERAL_RW_ATTR(min_ssr_sections);
+SM_INFO_GENERAL_RW_ATTR(reserved_segments);
/* DCC_INFO ATTR */
DCC_INFO_RW_ATTR(max_small_discards, max_discards);
@@ -1001,6 +1023,7 @@ F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold);
F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs);
F2FS_SBI_GENERAL_RW_ATTR(max_victim_search);
F2FS_SBI_GENERAL_RW_ATTR(migration_granularity);
+F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
@@ -1033,6 +1056,7 @@ F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold);
F2FS_SBI_GENERAL_RW_ATTR(last_age_weight);
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
+F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
/* STAT_INFO ATTR */
@@ -1072,6 +1096,7 @@ F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
+F2FS_GENERAL_RO_ATTR(atgc_enabled);
F2FS_GENERAL_RO_ATTR(gc_mode);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_GENERAL_RO_ATTR(moved_blocks_background);
@@ -1116,6 +1141,9 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_min_sleep_time),
ATTR_LIST(gc_max_sleep_time),
ATTR_LIST(gc_no_gc_sleep_time),
+ ATTR_LIST(gc_no_zoned_gc_percent),
+ ATTR_LIST(gc_boost_zoned_gc_percent),
+ ATTR_LIST(gc_valid_thresh_ratio),
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
@@ -1138,8 +1166,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_seq_blocks),
ATTR_LIST(min_hot_blocks),
ATTR_LIST(min_ssr_sections),
+ ATTR_LIST(reserved_segments),
ATTR_LIST(max_victim_search),
ATTR_LIST(migration_granularity),
+ ATTR_LIST(migration_window_granularity),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
@@ -1187,6 +1217,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(unusable_blocks_per_sec),
+ ATTR_LIST(blkzone_alloc_policy),
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compr_written_block),
@@ -1200,6 +1231,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(atgc_candidate_count),
ATTR_LIST(atgc_age_weight),
ATTR_LIST(atgc_age_threshold),
+ ATTR_LIST(atgc_enabled),
ATTR_LIST(seq_file_ra_mul),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 84a33fe49bed..2287f238ae09 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -74,7 +74,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
- if (pos + count > inode->i_sb->s_maxbytes)
+ if (pos + count > F2FS_BLK_TO_BYTES(max_file_blocks(inode)))
return -EFBIG;
while (count) {
@@ -237,7 +237,8 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
pos = le64_to_cpu(dloc.pos);
/* Get the descriptor */
- if (pos + size < pos || pos + size > inode->i_sb->s_maxbytes ||
+ if (pos + size < pos ||
+ pos + size > F2FS_BLK_TO_BYTES(max_file_blocks(inode)) ||
pos < f2fs_verity_metadata_pos(inode) || size > INT_MAX) {
f2fs_warn(F2FS_I_SB(inode), "invalid verity xattr");
f2fs_handle_error(F2FS_I_SB(inode),
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index f290fe9327c4..3f3874943679 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -629,6 +629,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
struct page *ipage, int flags)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_entry *here, *last;
void *base_addr, *last_base_addr;
int found, newsize;
@@ -772,9 +773,18 @@ retry:
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
- if (S_ISDIR(inode->i_mode))
- set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
+ if (!S_ISDIR(inode->i_mode))
+ goto same;
+ /*
+ * In restrict mode, fsync() always try to trigger checkpoint for all
+ * metadata consistency, in other mode, it triggers checkpoint when
+ * parent's xattr metadata was updated.
+ */
+ if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
+ set_sbi_flag(sbi, SBI_NEED_CP);
+ else
+ f2fs_add_ino_entry(sbi, inode->i_ino, XATTR_DIR_INO);
same:
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 081e5e3d89ea..22dd9dcce7ec 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -405,7 +405,7 @@ static long f_dupfd_query(int fd, struct file *filp)
* overkill, but given our lockless file pointer lookup, the
* alternatives are complicated.
*/
- return f.file == filp;
+ return fd_file(f) == filp;
}
/* Let the caller figure out whether a given file was just created. */
@@ -573,17 +573,17 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
struct fd f = fdget_raw(fd);
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
out1:
fdput(f);
@@ -600,15 +600,15 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
struct flock64 flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out1;
@@ -618,7 +618,7 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_getlk64(f.file, cmd, &flock);
+ err = fcntl_getlk64(fd_file(f), cmd, &flock);
if (!err && copy_to_user(argp, &flock, sizeof(flock)))
err = -EFAULT;
break;
@@ -629,10 +629,10 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_setlk64(fd, f.file, cmd, &flock);
+ err = fcntl_setlk64(fd, fd_file(f), cmd, &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out1:
@@ -737,15 +737,15 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
struct flock flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return err;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out_put;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out_put;
@@ -754,7 +754,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (err)
break;
err = fixup_compat_flock(&flock);
@@ -766,7 +766,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (!err)
err = put_compat_flock64(&flock, compat_ptr(arg));
break;
@@ -775,7 +775,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
case F_SETLK64:
case F_SETLKW64:
@@ -784,10 +784,10 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out_put:
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8cb665629f4a..82df28d45cd7 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -140,9 +140,9 @@ static int get_path_from_fd(int fd, struct path *root)
spin_unlock(&fs->lock);
} else {
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- *root = f.file->f_path;
+ *root = fd_file(f)->f_path;
path_get(root);
fdput(f);
}
diff --git a/fs/file.c b/fs/file.c
index 976ecd4ce2c6..5125607d040a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1124,7 +1124,7 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
struct file *file;
@@ -1141,22 +1141,22 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
- return 0;
- return (unsigned long)file;
+ return EMPTY_FD;
+ return BORROWED_FD(file);
} else {
file = __fget_files(files, fd, mask);
if (!file)
- return 0;
- return FDPUT_FPUT | (unsigned long)file;
+ return EMPTY_FD;
+ return CLONED_FD(file);
}
}
-unsigned long __fdget(unsigned int fd)
+struct fd fdget(unsigned int fd)
{
return __fget_light(fd, FMODE_PATH);
}
-EXPORT_SYMBOL(__fdget);
+EXPORT_SYMBOL(fdget);
-unsigned long __fdget_raw(unsigned int fd)
+struct fd fdget_raw(unsigned int fd)
{
return __fget_light(fd, 0);
}
@@ -1177,16 +1177,16 @@ static inline bool file_needs_f_pos_lock(struct file *file)
(file_count(file) > 1 || file->f_op->iterate_shared);
}
-unsigned long __fdget_pos(unsigned int fd)
+struct fd fdget_pos(unsigned int fd)
{
- unsigned long v = __fdget(fd);
- struct file *file = (struct file *)(v & ~3);
+ struct fd f = fdget(fd);
+ struct file *file = fd_file(f);
if (file && file_needs_f_pos_lock(file)) {
- v |= FDPUT_POS_UNLOCK;
+ f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
- return v;
+ return f;
}
void __f_unlock_pos(struct file *f)
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ed2dd000622e..6cef3deccded 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -78,7 +78,6 @@ static int fscontext_release(struct inode *inode, struct file *file)
const struct file_operations fscontext_fops = {
.read = fscontext_read,
.release = fscontext_release,
- .llseek = no_llseek,
};
/*
@@ -394,13 +393,13 @@ SYSCALL_DEFINE5(fsconfig,
}
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
ret = -EINVAL;
- if (f.file->f_op != &fscontext_fops)
+ if (fd_file(f)->f_op != &fscontext_fops)
goto out_f;
- fc = f.file->private_data;
+ fc = fd_file(f)->private_data;
if (fc->ops == &legacy_fs_context_ops) {
switch (cmd) {
case FSCONFIG_SET_BINARY:
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 6e0228c6d0cb..ce0ff7a9007b 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,6 +3,9 @@
# Makefile for the FUSE filesystem.
#
+# Needed for trace events
+ccflags-y = -I$(src)
+
obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 04cfd8fee992..8f484b105f13 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -12,7 +12,6 @@
#include <linux/posix_acl_xattr.h>
static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc,
- struct mnt_idmap *idmap,
struct inode *inode, int type, bool rcu)
{
int size;
@@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap,
if (fuse_no_acl(fc, inode))
return ERR_PTR(-EOPNOTSUPP);
- return __fuse_get_acl(fc, idmap, inode, type, false);
+ return __fuse_get_acl(fc, inode, type, false);
}
struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu)
@@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu)
*/
if (!fc->posix_acl)
return NULL;
-
- return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu);
+ return __fuse_get_acl(fc, inode, type, rcu);
}
int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -146,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
* be stripped.
*/
if (fc->posix_acl &&
- !in_group_or_capable(&nop_mnt_idmap, inode,
- i_gid_into_vfsgid(&nop_mnt_idmap, inode)))
+ !in_group_or_capable(idmap, inode,
+ i_gid_into_vfsgid(idmap, inode)))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 97ac994ff78f..2a730d88cc3b 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -183,27 +183,23 @@ out:
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
- .llseek = no_llseek,
};
static const struct file_operations fuse_ctl_waiting_ops = {
.open = nonseekable_open,
.read = fuse_conn_waiting_read,
- .llseek = no_llseek,
};
static const struct file_operations fuse_conn_max_background_ops = {
.open = nonseekable_open,
.read = fuse_conn_max_background_read,
.write = fuse_conn_max_background_write,
- .llseek = no_llseek,
};
static const struct file_operations fuse_conn_congestion_threshold_ops = {
.open = nonseekable_open,
.read = fuse_conn_congestion_threshold_read,
.write = fuse_conn_congestion_threshold_write,
- .llseek = no_llseek,
};
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f0c9cd1a0b39..1f64ae6d7a69 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -22,6 +22,9 @@
#include <linux/splice.h>
#include <linux/sched.h>
+#define CREATE_TRACE_POINTS
+#include "fuse_trace.h"
+
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -105,11 +108,17 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
static void fuse_put_request(struct fuse_req *req);
-static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
+static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ bool for_background)
{
struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
+ bool no_idmap = !fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP);
+ kuid_t fsuid;
+ kgid_t fsgid;
int err;
+
atomic_inc(&fc->num_waiting);
if (fuse_block_alloc(fc, for_background)) {
@@ -137,19 +146,32 @@ static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
goto out;
}
- req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
- req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
- if (unlikely(req->in.h.uid == ((uid_t)-1) ||
- req->in.h.gid == ((gid_t)-1))) {
+ /*
+ * Keep the old behavior when idmappings support was not
+ * declared by a FUSE server.
+ *
+ * For those FUSE servers who support idmapped mounts,
+ * we send UID/GID only along with "inode creation"
+ * fuse requests, otherwise idmap == &invalid_mnt_idmap and
+ * req->in.h.{u,g}id will be equal to FUSE_INVALID_UIDGID.
+ */
+ fsuid = no_idmap ? current_fsuid() : mapped_fsuid(idmap, fc->user_ns);
+ fsgid = no_idmap ? current_fsgid() : mapped_fsgid(idmap, fc->user_ns);
+ req->in.h.uid = from_kuid(fc->user_ns, fsuid);
+ req->in.h.gid = from_kgid(fc->user_ns, fsgid);
+
+ if (no_idmap && unlikely(req->in.h.uid == ((uid_t)-1) ||
+ req->in.h.gid == ((gid_t)-1))) {
fuse_put_request(req);
return ERR_PTR(-EOVERFLOW);
}
+
return req;
out:
@@ -194,11 +216,22 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
}
EXPORT_SYMBOL_GPL(fuse_len_args);
-u64 fuse_get_unique(struct fuse_iqueue *fiq)
+static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq)
{
fiq->reqctr += FUSE_REQ_ID_STEP;
return fiq->reqctr;
}
+
+u64 fuse_get_unique(struct fuse_iqueue *fiq)
+{
+ u64 ret;
+
+ spin_lock(&fiq->lock);
+ ret = fuse_get_unique_locked(fiq);
+ spin_unlock(&fiq->lock);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(fuse_get_unique);
static unsigned int fuse_req_hash(u64 unique)
@@ -217,22 +250,70 @@ __releases(fiq->lock)
spin_unlock(&fiq->lock);
}
+static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget)
+{
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ fiq->forget_list_tail->next = forget;
+ fiq->forget_list_tail = forget;
+ fuse_dev_wake_and_unlock(fiq);
+ } else {
+ kfree(forget);
+ spin_unlock(&fiq->lock);
+ }
+}
+
+static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ spin_lock(&fiq->lock);
+ if (list_empty(&req->intr_entry)) {
+ list_add_tail(&req->intr_entry, &fiq->interrupts);
+ /*
+ * Pairs with smp_mb() implied by test_and_set_bit()
+ * from fuse_request_end().
+ */
+ smp_mb();
+ if (test_bit(FR_FINISHED, &req->flags)) {
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->lock);
+ } else {
+ fuse_dev_wake_and_unlock(fiq);
+ }
+ } else {
+ spin_unlock(&fiq->lock);
+ }
+}
+
+static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req)
+{
+ spin_lock(&fiq->lock);
+ if (fiq->connected) {
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique_locked(fiq);
+ list_add_tail(&req->list, &fiq->pending);
+ fuse_dev_wake_and_unlock(fiq);
+ } else {
+ spin_unlock(&fiq->lock);
+ req->out.h.error = -ENOTCONN;
+ clear_bit(FR_PENDING, &req->flags);
+ fuse_request_end(req);
+ }
+}
+
const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
- .wake_forget_and_unlock = fuse_dev_wake_and_unlock,
- .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock,
- .wake_pending_and_unlock = fuse_dev_wake_and_unlock,
+ .send_forget = fuse_dev_queue_forget,
+ .send_interrupt = fuse_dev_queue_interrupt,
+ .send_req = fuse_dev_queue_req,
};
EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
-static void queue_request_and_unlock(struct fuse_iqueue *fiq,
- struct fuse_req *req)
-__releases(fiq->lock)
+static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req)
{
req->in.h.len = sizeof(struct fuse_in_header) +
fuse_len_args(req->args->in_numargs,
(struct fuse_arg *) req->args->in_args);
- list_add_tail(&req->list, &fiq->pending);
- fiq->ops->wake_pending_and_unlock(fiq);
+ trace_fuse_request_send(req);
+ fiq->ops->send_req(fiq, req);
}
void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -243,15 +324,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
forget->forget_one.nodeid = nodeid;
forget->forget_one.nlookup = nlookup;
- spin_lock(&fiq->lock);
- if (fiq->connected) {
- fiq->forget_list_tail->next = forget;
- fiq->forget_list_tail = forget;
- fiq->ops->wake_forget_and_unlock(fiq);
- } else {
- kfree(forget);
- spin_unlock(&fiq->lock);
- }
+ fiq->ops->send_forget(fiq, forget);
}
static void flush_bg_queue(struct fuse_conn *fc)
@@ -265,9 +338,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
list_del(&req->list);
fc->active_background++;
- spin_lock(&fiq->lock);
- req->in.h.unique = fuse_get_unique(fiq);
- queue_request_and_unlock(fiq, req);
+ fuse_send_one(fiq, req);
}
}
@@ -288,6 +359,7 @@ void fuse_request_end(struct fuse_req *req)
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
+ trace_fuse_request_end(req);
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below FR_INTERRUPTED check. Pairs with
@@ -337,29 +409,12 @@ static int queue_interrupt(struct fuse_req *req)
{
struct fuse_iqueue *fiq = &req->fm->fc->iq;
- spin_lock(&fiq->lock);
/* Check for we've sent request to interrupt this req */
- if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
- spin_unlock(&fiq->lock);
+ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags)))
return -EINVAL;
- }
- if (list_empty(&req->intr_entry)) {
- list_add_tail(&req->intr_entry, &fiq->interrupts);
- /*
- * Pairs with smp_mb() implied by test_and_set_bit()
- * from fuse_request_end().
- */
- smp_mb();
- if (test_bit(FR_FINISHED, &req->flags)) {
- list_del_init(&req->intr_entry);
- spin_unlock(&fiq->lock);
- return 0;
- }
- fiq->ops->wake_interrupt_and_unlock(fiq);
- } else {
- spin_unlock(&fiq->lock);
- }
+ fiq->ops->send_interrupt(fiq, req);
+
return 0;
}
@@ -414,21 +469,15 @@ static void __fuse_request_send(struct fuse_req *req)
struct fuse_iqueue *fiq = &req->fm->fc->iq;
BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
- spin_lock(&fiq->lock);
- if (!fiq->connected) {
- spin_unlock(&fiq->lock);
- req->out.h.error = -ENOTCONN;
- } else {
- req->in.h.unique = fuse_get_unique(fiq);
- /* acquire extra reference, since request is still needed
- after fuse_request_end() */
- __fuse_get_request(req);
- queue_request_and_unlock(fiq, req);
- request_wait_answer(req);
- /* Pairs with smp_wmb() in fuse_request_end() */
- smp_rmb();
- }
+ /* acquire extra reference, since request is still needed after
+ fuse_request_end() */
+ __fuse_get_request(req);
+ fuse_send_one(fiq, req);
+
+ request_wait_answer(req);
+ /* Pairs with smp_wmb() in fuse_request_end() */
+ smp_rmb();
}
static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
@@ -468,8 +517,14 @@ static void fuse_force_creds(struct fuse_req *req)
{
struct fuse_conn *fc = req->fm->fc;
- req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
- req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ if (!req->fm->sb || req->fm->sb->s_iflags & SB_I_NOIDMAP) {
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ } else {
+ req->in.h.uid = FUSE_INVALID_UIDGID;
+ req->in.h.gid = FUSE_INVALID_UIDGID;
+ }
+
req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
}
@@ -484,7 +539,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
__set_bit(FR_ASYNC, &req->flags);
}
-ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
+ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args)
{
struct fuse_conn *fc = fm->fc;
struct fuse_req *req;
@@ -501,7 +558,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
__set_bit(FR_FORCE, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fm, false);
+ req = fuse_get_req(idmap, fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -562,7 +619,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
__set_bit(FR_BACKGROUND, &req->flags);
} else {
WARN_ON(args->nocreds);
- req = fuse_get_req(fm, true);
+ req = fuse_get_req(&invalid_mnt_idmap, fm, true);
if (IS_ERR(req))
return PTR_ERR(req);
}
@@ -583,9 +640,8 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
{
struct fuse_req *req;
struct fuse_iqueue *fiq = &fm->fc->iq;
- int err = 0;
- req = fuse_get_req(fm, false);
+ req = fuse_get_req(&invalid_mnt_idmap, fm, false);
if (IS_ERR(req))
return PTR_ERR(req);
@@ -594,16 +650,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm,
fuse_args_to_req(req, args);
- spin_lock(&fiq->lock);
- if (fiq->connected) {
- queue_request_and_unlock(fiq, req);
- } else {
- err = -ENODEV;
- spin_unlock(&fiq->lock);
- fuse_put_request(req);
- }
+ fuse_send_one(fiq, req);
- return err;
+ return 0;
}
/*
@@ -1075,9 +1124,9 @@ __releases(fiq->lock)
return err ? err : reqsize;
}
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
- unsigned int max,
- unsigned int *countp)
+static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
+ unsigned int max,
+ unsigned int *countp)
{
struct fuse_forget_link *head = fiq->forget_list_head.next;
struct fuse_forget_link **newhead = &head;
@@ -1096,7 +1145,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
return head;
}
-EXPORT_SYMBOL(fuse_dequeue_forget);
static int fuse_read_single_forget(struct fuse_iqueue *fiq,
struct fuse_copy_state *cs,
@@ -1111,7 +1159,7 @@ __releases(fiq->lock)
struct fuse_in_header ih = {
.opcode = FUSE_FORGET,
.nodeid = forget->forget_one.nodeid,
- .unique = fuse_get_unique(fiq),
+ .unique = fuse_get_unique_locked(fiq),
.len = sizeof(ih) + sizeof(arg),
};
@@ -1142,7 +1190,7 @@ __releases(fiq->lock)
struct fuse_batch_forget_in arg = { .count = 0 };
struct fuse_in_header ih = {
.opcode = FUSE_BATCH_FORGET,
- .unique = fuse_get_unique(fiq),
+ .unique = fuse_get_unique_locked(fiq),
.len = sizeof(ih) + sizeof(arg),
};
@@ -1830,7 +1878,7 @@ static void fuse_resend(struct fuse_conn *fc)
}
/* iq and pq requests are both oldest to newest */
list_splice(&to_queue, &fiq->pending);
- fiq->ops->wake_pending_and_unlock(fiq);
+ fuse_dev_wake_and_unlock(fiq);
}
static int fuse_notify_resend(struct fuse_conn *fc)
@@ -2329,15 +2377,15 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
return -EFAULT;
f = fdget(oldfd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
/*
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
+ if (fd_file(f)->f_op == file->f_op)
+ fud = fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
@@ -2408,7 +2456,6 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
- .llseek = no_llseek,
.read_iter = fuse_dev_read,
.splice_read = fuse_dev_splice_read,
.write_iter = fuse_dev_write,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8e96df9fd76c..54104dd48af7 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -545,17 +545,21 @@ static u32 fuse_ext_size(size_t size)
/*
* This adds just a single supplementary group that matches the parent's group.
*/
-static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext)
+static int get_create_supp_group(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct fuse_in_arg *ext)
{
struct fuse_conn *fc = get_fuse_conn(dir);
struct fuse_ext_header *xh;
struct fuse_supp_groups *sg;
kgid_t kgid = dir->i_gid;
+ vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid);
gid_t parent_gid = from_kgid(fc->user_ns, kgid);
+
u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0]));
- if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) ||
- !in_group_p(kgid))
+ if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) ||
+ !vfsgid_in_group_p(vfsgid))
return 0;
xh = extend_arg(ext, sg_len);
@@ -572,7 +576,8 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext)
return 0;
}
-static int get_create_ext(struct fuse_args *args,
+static int get_create_ext(struct mnt_idmap *idmap,
+ struct fuse_args *args,
struct inode *dir, struct dentry *dentry,
umode_t mode)
{
@@ -583,7 +588,7 @@ static int get_create_ext(struct fuse_args *args,
if (fc->init_security)
err = get_security_context(dentry, mode, &ext);
if (!err && fc->create_supp_group)
- err = get_create_supp_group(dir, &ext);
+ err = get_create_supp_group(idmap, dir, &ext);
if (!err && ext.size) {
WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args));
@@ -609,9 +614,9 @@ static void free_ext_value(struct fuse_args *args)
* If the filesystem doesn't support this, then fall back to separate
* 'mknod' + 'open' requests.
*/
-static int fuse_create_open(struct inode *dir, struct dentry *entry,
- struct file *file, unsigned int flags,
- umode_t mode, u32 opcode)
+static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, struct file *file,
+ unsigned int flags, umode_t mode, u32 opcode)
{
int err;
struct inode *inode;
@@ -668,11 +673,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.out_args[1].size = sizeof(*outopenp);
args.out_args[1].value = outopenp;
- err = get_create_ext(&args, dir, entry, mode);
+ err = get_create_ext(idmap, &args, dir, entry, mode);
if (err)
goto out_free_ff;
- err = fuse_simple_request(fm, &args);
+ err = fuse_simple_idmap_request(idmap, fm, &args);
free_ext_value(&args);
if (err)
goto out_free_ff;
@@ -729,6 +734,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
umode_t mode)
{
int err;
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
struct fuse_conn *fc = get_fuse_conn(dir);
struct dentry *res = NULL;
@@ -753,7 +759,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (fc->no_create)
goto mknod;
- err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE);
+ err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE);
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
@@ -764,7 +770,7 @@ out_dput:
return err;
mknod:
- err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0);
+ err = fuse_mknod(idmap, dir, entry, mode, 0);
if (err)
goto out_dput;
no_open:
@@ -774,9 +780,9 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
- struct inode *dir, struct dentry *entry,
- umode_t mode)
+static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
@@ -798,12 +804,12 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
args->out_args[0].value = &outarg;
if (args->opcode != FUSE_LINK) {
- err = get_create_ext(args, dir, entry, mode);
+ err = get_create_ext(idmap, args, dir, entry, mode);
if (err)
goto out_put_forget_req;
}
- err = fuse_simple_request(fm, args);
+ err = fuse_simple_idmap_request(idmap, fm, args);
free_ext_value(args);
if (err)
goto out_put_forget_req;
@@ -864,13 +870,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fm, &args, dir, entry, mode);
+ return create_new_entry(idmap, fm, &args, dir, entry, mode);
}
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *entry, umode_t mode, bool excl)
{
- return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0);
+ return fuse_mknod(idmap, dir, entry, mode, 0);
}
static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
@@ -882,7 +888,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
if (fc->no_tmpfile)
return -EOPNOTSUPP;
- err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE);
+ err = fuse_create_open(idmap, dir, file->f_path.dentry, file,
+ file->f_flags, mode, FUSE_TMPFILE);
if (err == -ENOSYS) {
fc->no_tmpfile = 1;
err = -EOPNOTSUPP;
@@ -909,7 +916,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(fm, &args, dir, entry, S_IFDIR);
+ return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR);
}
static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
@@ -925,7 +932,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = entry->d_name.name;
args.in_args[1].size = len;
args.in_args[1].value = link;
- return create_new_entry(fm, &args, dir, entry, S_IFLNK);
+ return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
}
void fuse_flush_time_update(struct inode *inode)
@@ -1019,7 +1026,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
return err;
}
-static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent,
struct inode *newdir, struct dentry *newent,
unsigned int flags, int opcode, size_t argsize)
{
@@ -1040,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
args.in_args[1].value = oldent->d_name.name;
args.in_args[2].size = newent->d_name.len + 1;
args.in_args[2].value = newent->d_name.name;
- err = fuse_simple_request(fm, &args);
+ err = fuse_simple_idmap_request(idmap, fm, &args);
if (!err) {
/* ctime changes */
fuse_update_ctime(d_inode(oldent));
@@ -1086,7 +1093,8 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir,
if (fc->no_rename2 || fc->minor < 23)
return -EINVAL;
- err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : &invalid_mnt_idmap,
+ olddir, oldent, newdir, newent, flags,
FUSE_RENAME2,
sizeof(struct fuse_rename2_in));
if (err == -ENOSYS) {
@@ -1094,7 +1102,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir,
err = -EINVAL;
}
} else {
- err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0,
FUSE_RENAME,
sizeof(struct fuse_rename_in));
}
@@ -1119,7 +1127,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(fm, &args, newdir, newent, inode->i_mode);
+ err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
if (!err)
fuse_update_ctime_in_cache(inode);
else if (err == -EINTR)
@@ -1128,18 +1136,22 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
return err;
}
-static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
- struct kstat *stat)
+static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct fuse_attr *attr, struct kstat *stat)
{
unsigned int blkbits;
struct fuse_conn *fc = get_fuse_conn(inode);
+ vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns,
+ make_kuid(fc->user_ns, attr->uid));
+ vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns,
+ make_kgid(fc->user_ns, attr->gid));
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
stat->nlink = attr->nlink;
- stat->uid = make_kuid(fc->user_ns, attr->uid);
- stat->gid = make_kgid(fc->user_ns, attr->gid);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->atime.tv_sec = attr->atime;
stat->atime.tv_nsec = attr->atimensec;
@@ -1178,8 +1190,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr)
attr->blksize = sx->blksize;
}
-static int fuse_do_statx(struct inode *inode, struct file *file,
- struct kstat *stat)
+static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode,
+ struct file *file, struct kstat *stat)
{
int err;
struct fuse_attr attr;
@@ -1232,15 +1244,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file,
stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME);
stat->btime.tv_sec = sx->btime.tv_sec;
stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1);
- fuse_fillattr(inode, &attr, stat);
+ fuse_fillattr(idmap, inode, &attr, stat);
stat->result_mask |= STATX_TYPE;
}
return 0;
}
-static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
- struct file *file)
+static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode,
+ struct kstat *stat, struct file *file)
{
int err;
struct fuse_getattr_in inarg;
@@ -1279,15 +1291,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
ATTR_TIMEOUT(&outarg),
attr_version);
if (stat)
- fuse_fillattr(inode, &outarg.attr, stat);
+ fuse_fillattr(idmap, inode, &outarg.attr, stat);
}
}
return err;
}
-static int fuse_update_get_attr(struct inode *inode, struct file *file,
- struct kstat *stat, u32 request_mask,
- unsigned int flags)
+static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode,
+ struct file *file, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1318,17 +1330,17 @@ retry:
forget_all_cached_acls(inode);
/* Try statx if BTIME is requested */
if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) {
- err = fuse_do_statx(inode, file, stat);
+ err = fuse_do_statx(idmap, inode, file, stat);
if (err == -ENOSYS) {
fc->no_statx = 1;
err = 0;
goto retry;
}
} else {
- err = fuse_do_getattr(inode, stat, file);
+ err = fuse_do_getattr(idmap, inode, stat, file);
}
} else if (stat) {
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
if (test_bit(FUSE_I_BTIME, &fi->state)) {
@@ -1342,7 +1354,7 @@ retry:
int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
{
- return fuse_update_get_attr(inode, file, NULL, mask, 0);
+ return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0);
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
@@ -1462,6 +1474,14 @@ static int fuse_access(struct inode *inode, int mask)
BUG_ON(mask & MAY_NOT_BLOCK);
+ /*
+ * We should not send FUSE_ACCESS to the userspace
+ * when idmapped mounts are enabled as for this case
+ * we have fc->default_permissions = 1 and access
+ * permission checks are done on the kernel side.
+ */
+ WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP));
+
if (fm->fc->no_access)
return 0;
@@ -1486,7 +1506,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
return -ECHILD;
forget_all_cached_acls(inode);
- return fuse_do_getattr(inode, NULL, NULL);
+ return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL);
}
/*
@@ -1534,7 +1554,7 @@ static int fuse_permission(struct mnt_idmap *idmap,
}
if (fc->default_permissions) {
- err = generic_permission(&nop_mnt_idmap, inode, mask);
+ err = generic_permission(idmap, inode, mask);
/* If permission is denied, try to refresh file
attributes. This is also needed, because the root
@@ -1542,7 +1562,7 @@ static int fuse_permission(struct mnt_idmap *idmap,
if (err == -EACCES && !refreshed) {
err = fuse_perm_getattr(inode, mask);
if (!err)
- err = generic_permission(&nop_mnt_idmap,
+ err = generic_permission(idmap,
inode, mask);
}
@@ -1738,17 +1758,29 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
return true;
}
-static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
- struct fuse_setattr_in *arg, bool trust_local_cmtime)
+static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc,
+ struct iattr *iattr, struct fuse_setattr_in *arg,
+ bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
if (ivalid & ATTR_MODE)
arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
- if (ivalid & ATTR_UID)
- arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
- if (ivalid & ATTR_GID)
- arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
+
+ if (ivalid & ATTR_UID) {
+ kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid);
+
+ arg->valid |= FATTR_UID;
+ arg->uid = from_kuid(fc->user_ns, fsuid);
+ }
+
+ if (ivalid & ATTR_GID) {
+ kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid);
+
+ arg->valid |= FATTR_GID;
+ arg->gid = from_kgid(fc->user_ns, fsgid);
+ }
+
if (ivalid & ATTR_SIZE)
arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
if (ivalid & ATTR_ATIME) {
@@ -1868,8 +1900,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
* vmtruncate() doesn't allow for this case, so do the rlimit checking
* and the actual truncation by hand.
*/
-int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
- struct file *file)
+int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr, struct file *file)
{
struct inode *inode = d_inode(dentry);
struct fuse_mount *fm = get_fuse_mount(inode);
@@ -1889,7 +1921,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
- err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ err = setattr_prepare(idmap, dentry, attr);
if (err)
return err;
@@ -1948,7 +1980,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
+ iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -2065,7 +2097,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry,
* ia_mode calculation may have used stale i_mode.
* Refresh and recalculate.
*/
- ret = fuse_do_getattr(inode, NULL, file);
+ ret = fuse_do_getattr(idmap, inode, NULL, file);
if (ret)
return ret;
@@ -2083,7 +2115,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry,
if (!attr->ia_valid)
return 0;
- ret = fuse_do_setattr(entry, attr, file);
+ ret = fuse_do_setattr(idmap, entry, attr, file);
if (!ret) {
/*
* If filesystem supports acls it may have updated acl xattrs in
@@ -2122,7 +2154,7 @@ static int fuse_getattr(struct mnt_idmap *idmap,
return -EACCES;
}
- return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
+ return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags);
}
static const struct inode_operations fuse_dir_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ba6df52a823e..f33fbce86ae0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
/*
* Check if any page in a range is under writeback
- *
- * This is currently done by walking the list of writepage requests
- * for the inode, which can be pretty inefficient.
*/
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
pgoff_t idx_to)
@@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
struct fuse_inode *fi = get_fuse_inode(inode);
bool found;
+ if (RB_EMPTY_ROOT(&fi->writepages))
+ return false;
+
spin_lock(&fi->lock);
found = fuse_find_writeback(fi, idx_from, idx_to);
spin_unlock(&fi->lock);
@@ -1345,7 +1345,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from
/* shared locks are not allowed with parallel page cache IO */
if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
- return false;
+ return true;
/* Parallel dio beyond EOF is not supported, at least for now. */
if (fuse_io_past_eof(iocb, from))
@@ -1398,6 +1398,7 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
struct inode *inode = mapping->host;
@@ -1412,7 +1413,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- setattr_should_drop_suidgid(&nop_mnt_idmap,
+ setattr_should_drop_suidgid(idmap,
file_inode(file))) {
goto writethrough;
}
@@ -1762,27 +1763,31 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);
- if (wpa->ia.ff)
- fuse_file_put(wpa->ia.ff, false);
+ fuse_file_put(wpa->ia.ff, false);
kfree(ap->pages);
kfree(wpa);
}
-static void fuse_writepage_finish(struct fuse_mount *fm,
- struct fuse_writepage_args *wpa)
+static void fuse_writepage_finish_stat(struct inode *inode, struct page *page)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ dec_node_page_state(page, NR_WRITEBACK_TEMP);
+ wb_writeout_inc(&bdi->wb);
+}
+
+static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_pages; i++) {
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
- }
+ for (i = 0; i < ap->num_pages; i++)
+ fuse_writepage_finish_stat(inode, ap->pages[i]);
+
wake_up(&fi->page_waitq);
}
@@ -1829,19 +1834,14 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
- fuse_writepage_finish(fm, wpa);
+ fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
/* After rb_erase() aux request list is private */
for (aux = wpa->next; aux; aux = next) {
- struct backing_dev_info *bdi = inode_to_bdi(aux->inode);
-
next = aux->next;
aux->next = NULL;
-
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
+ fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]);
fuse_writepage_free(aux);
}
@@ -1936,7 +1936,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
wpa->next = next->next;
next->next = NULL;
- next->ia.ff = fuse_file_get(wpa->ia.ff);
tree_insert(&fi->writepages, next);
/*
@@ -1965,7 +1964,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fi->writectr--;
- fuse_writepage_finish(fm, wpa);
+ fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
@@ -2049,49 +2048,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
rcu_read_unlock();
}
+static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
+ struct folio *tmp_folio, uint32_t page_index)
+{
+ struct inode *inode = folio->mapping->host;
+ struct fuse_args_pages *ap = &wpa->ia.ap;
+
+ folio_copy(tmp_folio, folio);
+
+ ap->pages[page_index] = &tmp_folio->page;
+ ap->descs[page_index].offset = 0;
+ ap->descs[page_index].length = PAGE_SIZE;
+
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP);
+}
+
+static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ struct fuse_file *ff)
+{
+ struct inode *inode = folio->mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_writepage_args *wpa;
+ struct fuse_args_pages *ap;
+
+ wpa = fuse_writepage_args_alloc();
+ if (!wpa)
+ return NULL;
+
+ fuse_writepage_add_to_bucket(fc, wpa);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
+ wpa->inode = inode;
+ wpa->ia.ff = ff;
+
+ ap = &wpa->ia.ap;
+ ap->args.in_pages = true;
+ ap->args.end = fuse_writepage_end;
+
+ return wpa;
+}
+
static int fuse_writepage_locked(struct folio *folio)
{
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
- struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
struct folio *tmp_folio;
+ struct fuse_file *ff;
int error = -ENOMEM;
- folio_start_writeback(folio);
-
- wpa = fuse_writepage_args_alloc();
- if (!wpa)
- goto err;
- ap = &wpa->ia.ap;
-
tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
if (!tmp_folio)
- goto err_free;
+ goto err;
error = -EIO;
- wpa->ia.ff = fuse_write_file_get(fi);
- if (!wpa->ia.ff)
+ ff = fuse_write_file_get(fi);
+ if (!ff)
goto err_nofile;
- fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
+ wpa = fuse_writepage_args_setup(folio, ff);
+ error = -ENOMEM;
+ if (!wpa)
+ goto err_writepage_args;
- folio_copy(tmp_folio, folio);
- wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
- wpa->next = NULL;
- ap->args.in_pages = true;
+ ap = &wpa->ia.ap;
ap->num_pages = 1;
- ap->pages[0] = &tmp_folio->page;
- ap->descs[0].offset = 0;
- ap->descs[0].length = PAGE_SIZE;
- ap->args.end = fuse_writepage_end;
- wpa->inode = inode;
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
+ folio_start_writeback(folio);
+ fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
spin_lock(&fi->lock);
tree_insert(&fi->writepages, wpa);
@@ -2103,13 +2130,12 @@ static int fuse_writepage_locked(struct folio *folio)
return 0;
+err_writepage_args:
+ fuse_file_put(ff, false);
err_nofile:
folio_put(tmp_folio);
-err_free:
- kfree(wpa);
err:
mapping_set_error(folio->mapping, error);
- folio_end_writeback(folio);
return error;
}
@@ -2155,7 +2181,6 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
int num_pages = wpa->ia.ap.num_pages;
int i;
- wpa->ia.ff = fuse_file_get(data->ff);
spin_lock(&fi->lock);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
@@ -2210,11 +2235,7 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
spin_unlock(&fi->lock);
if (tmp) {
- struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
-
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
+ fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]);
fuse_writepage_free(new_wpa);
}
@@ -2264,24 +2285,17 @@ static int fuse_writepages_fill(struct folio *folio,
struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct page *tmp_page;
+ struct folio *tmp_folio;
int err;
- if (!data->ff) {
- err = -EIO;
- data->ff = fuse_write_file_get(fi);
- if (!data->ff)
- goto out_unlock;
- }
-
if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
}
err = -ENOMEM;
- tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
- if (!tmp_page)
+ tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
+ if (!tmp_folio)
goto out_unlock;
/*
@@ -2299,35 +2313,20 @@ static int fuse_writepages_fill(struct folio *folio,
*/
if (data->wpa == NULL) {
err = -ENOMEM;
- wpa = fuse_writepage_args_alloc();
+ wpa = fuse_writepage_args_setup(folio, data->ff);
if (!wpa) {
- __free_page(tmp_page);
+ folio_put(tmp_folio);
goto out_unlock;
}
- fuse_writepage_add_to_bucket(fc, wpa);
-
+ fuse_file_get(wpa->ia.ff);
data->max_pages = 1;
-
ap = &wpa->ia.ap;
- fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0);
- wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
- wpa->next = NULL;
- ap->args.in_pages = true;
- ap->args.end = fuse_writepage_end;
- ap->num_pages = 0;
- wpa->inode = inode;
}
folio_start_writeback(folio);
- copy_highpage(tmp_page, &folio->page);
- ap->pages[ap->num_pages] = tmp_page;
- ap->descs[ap->num_pages].offset = 0;
- ap->descs[ap->num_pages].length = PAGE_SIZE;
+ fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages);
data->orig_pages[ap->num_pages] = &folio->page;
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
-
err = 0;
if (data->wpa) {
/*
@@ -2352,13 +2351,13 @@ static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
- err = -EIO;
if (fuse_is_bad(inode))
- goto out;
+ return -EIO;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
@@ -2366,7 +2365,9 @@ static int fuse_writepages(struct address_space *mapping,
data.inode = inode;
data.wpa = NULL;
- data.ff = NULL;
+ data.ff = fuse_write_file_get(fi);
+ if (!data.ff)
+ return -EIO;
err = -ENOMEM;
data.orig_pages = kcalloc(fc->max_pages,
@@ -2380,11 +2381,10 @@ static int fuse_writepages(struct address_space *mapping,
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
}
- if (data.ff)
- fuse_file_put(data.ff, false);
kfree(data.orig_pages);
out:
+ fuse_file_put(data.ff, false);
return err;
}
@@ -2973,7 +2973,7 @@ static void fuse_do_truncate(struct file *file)
attr.ia_file = file;
attr.ia_valid |= ATTR_FILE;
- fuse_do_setattr(file_dentry(file), &attr, file);
+ fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file);
}
static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f23919610313..e6cc3d552b13 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -449,22 +449,19 @@ struct fuse_iqueue;
*/
struct fuse_iqueue_ops {
/**
- * Signal that a forget has been queued
+ * Send one forget
*/
- void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link);
/**
- * Signal that an INTERRUPT request has been queued
+ * Send interrupt for request
*/
- void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req);
/**
- * Signal that a request has been queued
+ * Send one request
*/
- void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
- __releases(fiq->lock);
+ void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req);
/**
* Clean up when fuse_iqueue is destroyed
@@ -869,7 +866,7 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Entry on the fuse_mount_list */
+ /** Entry on the fuse_conn_list */
struct list_head entry;
/** Device ID from the root super block */
@@ -1053,10 +1050,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
struct fuse_forget_link *fuse_alloc_forget(void);
-struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
- unsigned int max,
- unsigned int *countp);
-
/*
* Initialize READ or READDIR request
*/
@@ -1154,7 +1147,22 @@ void __exit fuse_ctl_cleanup(void);
/**
* Simple request sending that does request allocation and freeing
*/
-ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args);
+ssize_t __fuse_simple_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args);
+
+static inline ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
+{
+ return __fuse_simple_request(&invalid_mnt_idmap, fm, args);
+}
+
+static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap,
+ struct fuse_mount *fm,
+ struct fuse_args *args)
+{
+ return __fuse_simple_request(idmap, fm, args);
+}
+
int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
gfp_t gfp_flags);
@@ -1330,8 +1338,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written);
int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
-int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
- struct file *file);
+int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr, struct file *file);
void fuse_set_initialized(struct fuse_conn *fc);
diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h
new file mode 100644
index 000000000000..bbe9ddd8c716
--- /dev/null
+++ b/fs/fuse/fuse_trace.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fuse
+
+#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FUSE_H
+
+#include <linux/tracepoint.h>
+
+#define OPCODES \
+ EM( FUSE_LOOKUP, "FUSE_LOOKUP") \
+ EM( FUSE_FORGET, "FUSE_FORGET") \
+ EM( FUSE_GETATTR, "FUSE_GETATTR") \
+ EM( FUSE_SETATTR, "FUSE_SETATTR") \
+ EM( FUSE_READLINK, "FUSE_READLINK") \
+ EM( FUSE_SYMLINK, "FUSE_SYMLINK") \
+ EM( FUSE_MKNOD, "FUSE_MKNOD") \
+ EM( FUSE_MKDIR, "FUSE_MKDIR") \
+ EM( FUSE_UNLINK, "FUSE_UNLINK") \
+ EM( FUSE_RMDIR, "FUSE_RMDIR") \
+ EM( FUSE_RENAME, "FUSE_RENAME") \
+ EM( FUSE_LINK, "FUSE_LINK") \
+ EM( FUSE_OPEN, "FUSE_OPEN") \
+ EM( FUSE_READ, "FUSE_READ") \
+ EM( FUSE_WRITE, "FUSE_WRITE") \
+ EM( FUSE_STATFS, "FUSE_STATFS") \
+ EM( FUSE_RELEASE, "FUSE_RELEASE") \
+ EM( FUSE_FSYNC, "FUSE_FSYNC") \
+ EM( FUSE_SETXATTR, "FUSE_SETXATTR") \
+ EM( FUSE_GETXATTR, "FUSE_GETXATTR") \
+ EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \
+ EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \
+ EM( FUSE_FLUSH, "FUSE_FLUSH") \
+ EM( FUSE_INIT, "FUSE_INIT") \
+ EM( FUSE_OPENDIR, "FUSE_OPENDIR") \
+ EM( FUSE_READDIR, "FUSE_READDIR") \
+ EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \
+ EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \
+ EM( FUSE_GETLK, "FUSE_GETLK") \
+ EM( FUSE_SETLK, "FUSE_SETLK") \
+ EM( FUSE_SETLKW, "FUSE_SETLKW") \
+ EM( FUSE_ACCESS, "FUSE_ACCESS") \
+ EM( FUSE_CREATE, "FUSE_CREATE") \
+ EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \
+ EM( FUSE_BMAP, "FUSE_BMAP") \
+ EM( FUSE_DESTROY, "FUSE_DESTROY") \
+ EM( FUSE_IOCTL, "FUSE_IOCTL") \
+ EM( FUSE_POLL, "FUSE_POLL") \
+ EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \
+ EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \
+ EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \
+ EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \
+ EM( FUSE_RENAME2, "FUSE_RENAME2") \
+ EM( FUSE_LSEEK, "FUSE_LSEEK") \
+ EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \
+ EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \
+ EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \
+ EM( FUSE_SYNCFS, "FUSE_SYNCFS") \
+ EM( FUSE_TMPFILE, "FUSE_TMPFILE") \
+ EM( FUSE_STATX, "FUSE_STATX") \
+ EMe(CUSE_INIT, "CUSE_INIT")
+
+/*
+ * This will turn the above table into TRACE_DEFINE_ENUM() for each of the
+ * entries.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+OPCODES
+
+/* Now we redfine it with the table that __print_symbolic needs. */
+#undef EM
+#undef EMe
+#define EM(a, b) {a, b},
+#define EMe(a, b) {a, b}
+
+TRACE_EVENT(fuse_request_send,
+ TP_PROTO(const struct fuse_req *req),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, connection)
+ __field(uint64_t, unique)
+ __field(enum fuse_opcode, opcode)
+ __field(uint32_t, len)
+ ),
+
+ TP_fast_assign(
+ __entry->connection = req->fm->fc->dev;
+ __entry->unique = req->in.h.unique;
+ __entry->opcode = req->in.h.opcode;
+ __entry->len = req->in.h.len;
+ ),
+
+ TP_printk("connection %u req %llu opcode %u (%s) len %u ",
+ __entry->connection, __entry->unique, __entry->opcode,
+ __print_symbolic(__entry->opcode, OPCODES), __entry->len)
+);
+
+TRACE_EVENT(fuse_request_end,
+ TP_PROTO(const struct fuse_req *req),
+
+ TP_ARGS(req),
+
+ TP_STRUCT__entry(
+ __field(dev_t, connection)
+ __field(uint64_t, unique)
+ __field(uint32_t, len)
+ __field(int32_t, error)
+ ),
+
+ TP_fast_assign(
+ __entry->connection = req->fm->fc->dev;
+ __entry->unique = req->in.h.unique;
+ __entry->len = req->out.h.len;
+ __entry->error = req->out.h.error;
+ ),
+
+ TP_printk("connection %u req %llu len %u error %d", __entry->connection,
+ __entry->unique, __entry->len, __entry->error)
+);
+
+#endif /* _TRACE_FUSE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE fuse_trace
+#include <trace/define_trace.h>
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index bebd89002328..fd3321e29a3e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1348,6 +1348,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
}
if (flags & FUSE_NO_EXPORT_SUPPORT)
fm->sb->s_export_op = &fuse_export_fid_operations;
+ if (flags & FUSE_ALLOW_IDMAP) {
+ if (fc->default_permissions)
+ fm->sb->s_iflags &= ~SB_I_NOIDMAP;
+ else
+ ok = false;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1395,7 +1401,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
- FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1572,6 +1578,7 @@ static void fuse_sb_defaults(struct super_block *sb)
sb->s_time_gran = 1;
sb->s_export_op = &fuse_export_operations;
sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+ sb->s_iflags |= SB_I_NOIDMAP;
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
@@ -1984,7 +1991,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
- .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
.init_fs_context = fuse_init_fs_context,
.parameters = fuse_fs_parameters,
.kill_sb = fuse_kill_sb_anon,
@@ -2005,7 +2012,7 @@ static struct file_system_type fuseblk_fs_type = {
.init_fs_context = fuse_init_fs_context,
.parameters = fuse_fs_parameters,
.kill_sb = fuse_kill_sb_blk,
- .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("fuseblk");
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 9666d13884ce..62aee8289d11 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -228,16 +228,13 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
if (map->flags || map->padding)
goto out;
- file = fget(map->fd);
+ file = fget_raw(map->fd);
res = -EBADF;
if (!file)
goto out;
- res = -EOPNOTSUPP;
- if (!file->f_op->read_iter || !file->f_op->write_iter)
- goto out_fput;
-
backing_sb = file_inode(file)->i_sb;
+ pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth);
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
goto out_fput;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index dd5260141615..6404a189e989 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -56,12 +56,14 @@ struct virtio_fs_vq {
bool connected;
long in_flight;
struct completion in_flight_zero; /* No inflight requests */
+ struct kobject *kobj;
char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
/* A virtio-fs device instance */
struct virtio_fs {
struct kobject kobj;
+ struct kobject *mqs_kobj;
struct list_head list; /* on virtio_fs_instances */
char *tag;
struct virtio_fs_vq *vqs;
@@ -200,19 +202,94 @@ static const struct kobj_type virtio_fs_ktype = {
.default_groups = virtio_fs_groups,
};
+static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs,
+ struct kobject *kobj)
+{
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ if (kobj == fs->vqs[i].kobj)
+ return &fs->vqs[i];
+ }
+ return NULL;
+}
+
+static ssize_t name_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj);
+ struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj);
+
+ if (!fsvq)
+ return -EINVAL;
+ return sysfs_emit(buf, "%s\n", fsvq->name);
+}
+
+static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name);
+
+static ssize_t cpu_list_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj);
+ struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj);
+ unsigned int cpu, qid;
+ const size_t size = PAGE_SIZE - 1;
+ bool first = true;
+ int ret = 0, pos = 0;
+
+ if (!fsvq)
+ return -EINVAL;
+
+ qid = fsvq->vq->index;
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid - VQ_REQUEST)) {
+ if (first)
+ ret = snprintf(buf + pos, size - pos, "%u", cpu);
+ else
+ ret = snprintf(buf + pos, size - pos, ", %u", cpu);
+
+ if (ret >= size - pos)
+ break;
+ first = false;
+ pos += ret;
+ }
+ }
+ ret = snprintf(buf + pos, size + 1 - pos, "\n");
+ return pos + ret;
+}
+
+static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list);
+
+static struct attribute *virtio_fs_vq_attrs[] = {
+ &virtio_fs_vq_name_attr.attr,
+ &virtio_fs_vq_cpu_list_attr.attr,
+ NULL
+};
+
+static struct attribute_group virtio_fs_vq_attr_group = {
+ .attrs = virtio_fs_vq_attrs,
+};
+
/* Make sure virtiofs_mutex is held */
-static void virtio_fs_put(struct virtio_fs *fs)
+static void virtio_fs_put_locked(struct virtio_fs *fs)
{
+ lockdep_assert_held(&virtio_fs_mutex);
+
kobject_put(&fs->kobj);
}
+static void virtio_fs_put(struct virtio_fs *fs)
+{
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put_locked(fs);
+ mutex_unlock(&virtio_fs_mutex);
+}
+
static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
{
struct virtio_fs *vfs = fiq->priv;
- mutex_lock(&virtio_fs_mutex);
virtio_fs_put(vfs);
- mutex_unlock(&virtio_fs_mutex);
}
static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
@@ -273,6 +350,50 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs)
}
}
+static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ int i;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+ kobject_put(fsvq->kobj);
+ }
+}
+
+static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
+{
+ struct virtio_fs_vq *fsvq;
+ char buff[12];
+ int i, j, ret;
+
+ for (i = 0; i < fs->nvqs; i++) {
+ fsvq = &fs->vqs[i];
+
+ sprintf(buff, "%d", i);
+ fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
+ if (!fs->mqs_kobj) {
+ ret = -ENOMEM;
+ goto out_del;
+ }
+
+ ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group);
+ if (ret) {
+ kobject_put(fsvq->kobj);
+ goto out_del;
+ }
+ }
+
+ return 0;
+
+out_del:
+ for (j = 0; j < i; j++) {
+ fsvq = &fs->vqs[j];
+ kobject_put(fsvq->kobj);
+ }
+ return ret;
+}
+
/* Add a new instance to the list or return -EEXIST if tag name exists*/
static int virtio_fs_add_instance(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -296,17 +417,22 @@ static int virtio_fs_add_instance(struct virtio_device *vdev,
*/
fs->kobj.kset = virtio_fs_kset;
ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
- if (ret < 0) {
- mutex_unlock(&virtio_fs_mutex);
- return ret;
+ if (ret < 0)
+ goto out_unlock;
+
+ fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj);
+ if (!fs->mqs_kobj) {
+ ret = -ENOMEM;
+ goto out_del;
}
ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
- if (ret < 0) {
- kobject_del(&fs->kobj);
- mutex_unlock(&virtio_fs_mutex);
- return ret;
- }
+ if (ret < 0)
+ goto out_put;
+
+ ret = virtio_fs_add_queues_sysfs(fs);
+ if (ret)
+ goto out_remove;
list_add_tail(&fs->list, &virtio_fs_instances);
@@ -315,6 +441,16 @@ static int virtio_fs_add_instance(struct virtio_device *vdev,
kobject_uevent(&fs->kobj, KOBJ_ADD);
return 0;
+
+out_remove:
+ sysfs_remove_link(&fs->kobj, "device");
+out_put:
+ kobject_put(fs->mqs_kobj);
+out_del:
+ kobject_del(&fs->kobj);
+out_unlock:
+ mutex_unlock(&virtio_fs_mutex);
+ return ret;
}
/* Return the virtio_fs with a given tag, or NULL */
@@ -1043,7 +1179,9 @@ static void virtio_fs_remove(struct virtio_device *vdev)
mutex_lock(&virtio_fs_mutex);
/* This device is going away. No one should get new reference */
list_del_init(&fs->list);
+ virtio_fs_delete_queues_sysfs(fs);
sysfs_remove_link(&fs->kobj, "device");
+ kobject_put(fs->mqs_kobj);
kobject_del(&fs->kobj);
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
@@ -1052,7 +1190,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
vdev->priv = NULL;
/* Put device reference on virtio_fs object */
- virtio_fs_put(fs);
+ virtio_fs_put_locked(fs);
mutex_unlock(&virtio_fs_mutex);
}
@@ -1091,22 +1229,13 @@ static struct virtio_driver virtio_fs_driver = {
#endif
};
-static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link)
{
- struct fuse_forget_link *link;
struct virtio_fs_forget *forget;
struct virtio_fs_forget_req *req;
- struct virtio_fs *fs;
- struct virtio_fs_vq *fsvq;
- u64 unique;
-
- link = fuse_dequeue_forget(fiq, 1, NULL);
- unique = fuse_get_unique(fiq);
-
- fs = fiq->priv;
- fsvq = &fs->vqs[VQ_HIPRIO];
- spin_unlock(&fiq->lock);
+ struct virtio_fs *fs = fiq->priv;
+ struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO];
+ u64 unique = fuse_get_unique(fiq);
/* Allocate a buffer for the request */
forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL);
@@ -1126,8 +1255,7 @@ __releases(fiq->lock)
kfree(link);
}
-static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
/*
* TODO interrupts.
@@ -1136,7 +1264,6 @@ __releases(fiq->lock)
* Exceptions are blocking lock operations; for example fcntl(F_SETLKW)
* with shared lock between host and guest.
*/
- spin_unlock(&fiq->lock);
}
/* Count number of scatter-gather elements required */
@@ -1341,21 +1468,17 @@ out:
return ret;
}
-static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
-__releases(fiq->lock)
+static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req)
{
unsigned int queue_id;
struct virtio_fs *fs;
- struct fuse_req *req;
struct virtio_fs_vq *fsvq;
int ret;
- WARN_ON(list_empty(&fiq->pending));
- req = list_last_entry(&fiq->pending, struct fuse_req, list);
+ if (req->in.h.opcode != FUSE_NOTIFY_REPLY)
+ req->in.h.unique = fuse_get_unique(fiq);
+
clear_bit(FR_PENDING, &req->flags);
- list_del_init(&req->list);
- WARN_ON(!list_empty(&fiq->pending));
- spin_unlock(&fiq->lock);
fs = fiq->priv;
queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
@@ -1393,10 +1516,10 @@ __releases(fiq->lock)
}
static const struct fuse_iqueue_ops virtio_fs_fiq_ops = {
- .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock,
- .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock,
- .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock,
- .release = virtio_fs_fiq_release,
+ .send_forget = virtio_fs_send_forget,
+ .send_interrupt = virtio_fs_send_interrupt,
+ .send_req = virtio_fs_send_req,
+ .release = virtio_fs_fiq_release,
};
static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx)
@@ -1596,9 +1719,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
out_err:
kfree(fc);
- mutex_lock(&virtio_fs_mutex);
virtio_fs_put(fs);
- mutex_unlock(&virtio_fs_mutex);
return err;
}
@@ -1628,6 +1749,7 @@ static struct file_system_type virtio_fs_type = {
.name = "virtiofs",
.init_fs_context = virtio_fs_init_fs_context,
.kill_sb = virtio_kill_sb,
+ .fs_flags = FS_ALLOW_IDMAP,
};
static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 10d5acd3f742..68fc8af14700 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -139,35 +139,6 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
}
/**
- * gfs2_jdata_writepage - Write complete page
- * @page: Page to write
- * @wbc: The writeback control
- *
- * Returns: errno
- *
- */
-
-static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
-
- if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE))
- goto out;
- if (folio_test_checked(folio) || current->journal_info)
- goto out_ignore;
- return __gfs2_jdata_write_folio(folio, wbc);
-
-out_ignore:
- folio_redirty_for_writepage(wbc, folio);
-out:
- folio_unlock(folio);
- return 0;
-}
-
-/**
* gfs2_writepages - Write a bunch of dirty pages back to disk
* @mapping: The mapping to write
* @wbc: Write-back control
@@ -748,7 +719,6 @@ static const struct address_space_operations gfs2_aops = {
};
static const struct address_space_operations gfs2_jdata_aops = {
- .writepage = gfs2_jdata_writepage,
.writepages = gfs2_jdata_writepages,
.read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 08982937b5df..f7dd64856c9b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1057,7 +1057,7 @@ retry:
}
pagefault_disable();
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL);
pagefault_enable();
if (ret > 0)
written += ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12a769077ea0..269c3bc7fced 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1885,14 +1885,16 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
{
unsigned long delay = 0;
- unsigned long holdtime;
- unsigned long now = jiffies;
gfs2_glock_hold(gl);
spin_lock(&gl->gl_lockref.lock);
- holdtime = gl->gl_tchange + gl->gl_hold_time;
if (!list_empty(&gl->gl_holders) &&
gl->gl_name.ln_type == LM_TYPE_INODE) {
+ unsigned long now = jiffies;
+ unsigned long holdtime;
+
+ holdtime = gl->gl_tchange + gl->gl_hold_time;
+
if (time_before(now, holdtime))
delay = holdtime - now;
if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags))
@@ -2249,6 +2251,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
gfs2_free_dead_glocks(sdp);
glock_hash_walk(dump_glock_func, sdp);
destroy_workqueue(sdp->sd_glock_wq);
+ sdp->sd_glock_wq = NULL;
}
static const char *state2str(unsigned state)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6ee6013fb825..f9c5089783d2 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -80,15 +80,6 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
brelse(bd->bd_bh);
}
-static int __gfs2_writepage(struct folio *folio, struct writeback_control *wbc,
- void *data)
-{
- struct address_space *mapping = data;
- int ret = mapping->a_ops->writepage(&folio->page, wbc);
- mapping_set_error(mapping, ret);
- return ret;
-}
-
/**
* gfs2_ail1_start_one - Start I/O on a transaction
* @sdp: The superblock
@@ -140,7 +131,7 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping);
+ ret = mapping->a_ops->writepages(mapping, wbc);
if (need_resched()) {
blk_finish_plug(plug);
cond_resched();
@@ -149,6 +140,7 @@ __acquires(&sdp->sd_ail_lock)
spin_lock(&sdp->sd_ail_lock);
if (ret == -ENODATA) /* if a jdata write into a new hole */
ret = 0; /* ignore it */
+ mapping_set_error(mapping, ret);
if (ret || wbc->nr_to_write <= 0)
break;
return -EBUSY;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 2b26e8d529aa..fea3efcc2f93 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -30,9 +30,9 @@
#include "util.h"
#include "trace_gfs2.h"
-static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
+static void gfs2_aspace_write_folio(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
struct buffer_head *bh, *head;
int nr_underway = 0;
blk_opf_t write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
@@ -66,8 +66,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
} while ((bh = bh->b_this_page) != head);
/*
- * The page and its buffers are protected by PageWriteback(), so we can
- * drop the bh refcounts early.
+ * The folio and its buffers are protected from truncation by
+ * the writeback flag, so we can drop the bh refcounts early.
*/
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
@@ -84,21 +84,31 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
if (nr_underway == 0)
folio_end_writeback(folio);
+}
- return 0;
+static int gfs2_aspace_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ gfs2_aspace_write_folio(folio, wbc);
+
+ return error;
}
const struct address_space_operations gfs2_meta_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .writepage = gfs2_aspace_writepage,
+ .writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
};
const struct address_space_operations gfs2_rgrp_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
- .writepage = gfs2_aspace_writepage,
+ .writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
};
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ff1f3e3dc65c..e83d293c3614 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1307,7 +1307,8 @@ fail_debug:
fail_delete_wq:
destroy_workqueue(sdp->sd_delete_wq);
fail_glock_wq:
- destroy_workqueue(sdp->sd_glock_wq);
+ if (sdp->sd_glock_wq)
+ destroy_workqueue(sdp->sd_glock_wq);
fail_free:
free_sbd(sdp);
sb->s_fs_info = NULL;
diff --git a/fs/inode.c b/fs/inode.c
index af78f515403f..471ae4a31549 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -439,14 +439,6 @@ static void init_once(void *foo)
}
/*
- * inode->i_lock must be held
- */
-void __iget(struct inode *inode)
-{
- atomic_inc(&inode->i_count);
-}
-
-/*
* get additional reference to inode; caller must already hold one.
*/
void ihold(struct inode *inode)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 64776891120c..6e0c954388d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -235,9 +235,9 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
loff_t cloned;
int ret;
- if (!src_file.file)
+ if (!fd_file(src_file))
return -EBADF;
- cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+ cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
olen, 0);
if (cloned < 0)
ret = cloned;
@@ -895,16 +895,16 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
struct fd f = fdget(fd);
int error;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl(fd_file(f), cmd, arg);
if (error)
goto out;
- error = do_vfs_ioctl(f.file, fd, cmd, arg);
+ error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
if (error == -ENOIOCTLCMD)
- error = vfs_ioctl(f.file, cmd, arg);
+ error = vfs_ioctl(fd_file(f), cmd, arg);
out:
fdput(f);
@@ -953,32 +953,32 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
struct fd f = fdget(fd);
int error;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = security_file_ioctl_compat(f.file, cmd, arg);
+ error = security_file_ioctl_compat(fd_file(f), cmd, arg);
if (error)
goto out;
switch (cmd) {
/* FICLONE takes an int argument, so don't use compat_ptr() */
case FICLONE:
- error = ioctl_file_clone(f.file, arg, 0, 0, 0);
+ error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0);
break;
#if defined(CONFIG_X86_64)
/* these get messy on amd64 due to alignment differences */
case FS_IOC_RESVSP_32:
case FS_IOC_RESVSP64_32:
- error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
+ error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg));
break;
case FS_IOC_UNRESVSP_32:
case FS_IOC_UNRESVSP64_32:
- error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
+ error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE,
compat_ptr(arg));
break;
case FS_IOC_ZERO_RANGE_32:
- error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
+ error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE,
compat_ptr(arg));
break;
#endif
@@ -998,13 +998,13 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
* argument.
*/
default:
- error = do_vfs_ioctl(f.file, fd, cmd,
+ error = do_vfs_ioctl(fd_file(f), fd, cmd,
(unsigned long)compat_ptr(arg));
if (error != -ENOIOCTLCMD)
break;
- if (f.file->f_op->compat_ioctl)
- error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
+ if (fd_file(f)->f_op->compat_ioctl)
+ error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg);
if (error == -ENOIOCTLCMD)
error = -ENOTTY;
break;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9b4ca3811a24..11ea747228ae 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,7 +23,6 @@
#define IOEND_BATCH_SIZE 4096
-typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -1022,13 +1021,14 @@ retry:
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(i),
.flags = IOMAP_WRITE,
+ .private = private,
};
ssize_t ret;
@@ -1046,15 +1046,14 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
unsigned int first_blk, last_blk, i;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
- int ret = 0;
/*
* When we have per-block dirty tracking, there can be
@@ -1064,47 +1063,35 @@ static int iomap_write_delalloc_ifs_punch(struct inode *inode,
*/
ifs = folio->private;
if (!ifs)
- return ret;
+ return;
last_byte = min_t(loff_t, end_byte - 1,
folio_pos(folio) + folio_size(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i)) {
- ret = punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits);
- if (ret)
- return ret;
- }
+ if (!ifs_block_is_dirty(folio, ifs, i))
+ punch(inode, folio_pos(folio) + (i << blkbits),
+ 1 << blkbits, iomap);
}
-
- return ret;
}
-
-static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
- int ret = 0;
-
if (!folio_test_dirty(folio))
- return ret;
+ return;
/* if dirty, punch up to offset */
if (start_byte > *punch_start_byte) {
- ret = punch(inode, *punch_start_byte,
- start_byte - *punch_start_byte);
- if (ret)
- return ret;
+ punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
+ iomap);
}
/* Punch non-dirty blocks within folio */
- ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
- end_byte, punch);
- if (ret)
- return ret;
+ iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
+ iomap, punch);
/*
* Make sure the next punch start is correctly bound to
@@ -1112,8 +1099,6 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
*/
*punch_start_byte = min_t(loff_t, end_byte,
folio_pos(folio) + folio_size(folio));
-
- return ret;
}
/*
@@ -1133,13 +1118,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
* This function uses [start_byte, end_byte) intervals (i.e. open ended) to
* simplify range iterations.
*/
-static int iomap_write_delalloc_scan(struct inode *inode,
+static void iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- iomap_punch_t punch)
+ struct iomap *iomap, iomap_punch_t punch)
{
while (start_byte < end_byte) {
struct folio *folio;
- int ret;
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
@@ -1150,20 +1134,14 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue;
}
- ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
- start_byte, end_byte, punch);
- if (ret) {
- folio_unlock(folio);
- folio_put(folio);
- return ret;
- }
+ iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+ start_byte, end_byte, iomap, punch);
/* move offset to start of next folio in range */
start_byte = folio_next_index(folio) << PAGE_SHIFT;
folio_unlock(folio);
folio_put(folio);
}
- return 0;
}
/*
@@ -1199,12 +1177,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
* the code to subtle off-by-one bugs....
*/
-static int iomap_write_delalloc_release(struct inode *inode,
- loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
+static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+ loff_t end_byte, unsigned flags, struct iomap *iomap,
+ iomap_punch_t punch)
{
loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
- int error = 0;
/*
* Lock the mapping to avoid races with page faults re-instantiating
@@ -1221,13 +1199,15 @@ static int iomap_write_delalloc_release(struct inode *inode,
/*
* If there is no more data to scan, all that is left is to
* punch out the remaining range.
+ *
+ * Note that mapping_seek_hole_data is only supposed to return
+ * either an offset or -ENXIO, so WARN on any other error as
+ * that would be an API change without updating the callers.
*/
if (start_byte == -ENXIO || start_byte == scan_end_byte)
break;
- if (start_byte < 0) {
- error = start_byte;
+ if (WARN_ON_ONCE(start_byte < 0))
goto out_unlock;
- }
WARN_ON_ONCE(start_byte < punch_start_byte);
WARN_ON_ONCE(start_byte > scan_end_byte);
@@ -1237,28 +1217,31 @@ static int iomap_write_delalloc_release(struct inode *inode,
*/
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
scan_end_byte, SEEK_HOLE);
- if (data_end < 0) {
- error = data_end;
+ if (WARN_ON_ONCE(data_end < 0))
goto out_unlock;
- }
- WARN_ON_ONCE(data_end <= start_byte);
+
+ /*
+ * If we race with post-direct I/O invalidation of the page cache,
+ * there might be no data left at start_byte.
+ */
+ if (data_end == start_byte)
+ continue;
+
+ WARN_ON_ONCE(data_end < start_byte);
WARN_ON_ONCE(data_end > scan_end_byte);
- error = iomap_write_delalloc_scan(inode, &punch_start_byte,
- start_byte, data_end, punch);
- if (error)
- goto out_unlock;
+ iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte,
+ data_end, iomap, punch);
/* The next data search starts at the end of this one. */
start_byte = data_end;
}
if (punch_start_byte < end_byte)
- error = punch(inode, punch_start_byte,
- end_byte - punch_start_byte);
+ punch(inode, punch_start_byte, end_byte - punch_start_byte,
+ iomap);
out_unlock:
filemap_invalidate_unlock(inode->i_mapping);
- return error;
}
/*
@@ -1291,20 +1274,20 @@ out_unlock:
* ->punch
* internal filesystem allocation lock
*/
-int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- struct iomap *iomap, loff_t pos, loff_t length,
- ssize_t written, iomap_punch_t punch)
+void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ loff_t pos, loff_t length, ssize_t written, unsigned flags,
+ struct iomap *iomap, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
unsigned int blocksize = i_blocksize(inode);
if (iomap->type != IOMAP_DELALLOC)
- return 0;
+ return;
/* If we didn't reserve the blocks, we're not allowed to punch them. */
if (!(iomap->flags & IOMAP_F_NEW))
- return 0;
+ return;
/*
* start_byte refers to the first unused block after a short write. If
@@ -1319,26 +1302,35 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
/* Nothing to do if we've written the entire delalloc extent */
if (start_byte >= end_byte)
- return 0;
+ return;
- return iomap_write_delalloc_release(inode, start_byte, end_byte,
- punch);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
+ punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* don't bother with blocks that are not shared to start with */
+ /* Don't bother with blocks that are not shared to start with. */
if (!(iomap->flags & IOMAP_F_SHARED))
return length;
- /* don't bother with holes or unwritten extents */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+
+ /*
+ * Don't bother with holes or unwritten extents.
+ *
+ * Note that we use srcmap directly instead of iomap_iter_srcmap as
+ * unsharing requires providing a separate source map, and the presence
+ * of one is a good indicator that unsharing is needed, unlike
+ * IOMAP_F_SHARED which can be set for any data that goes into the COW
+ * fork for XFS.
+ */
+ if (iter->srcmap.type == IOMAP_HOLE ||
+ iter->srcmap.type == IOMAP_UNWRITTEN)
return length;
do {
@@ -1393,16 +1385,53 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+ struct address_space *mapping = i->inode->i_mapping;
+ loff_t end = i->pos + i->len - 1;
+
+ i->iomap.flags |= IOMAP_F_STALE;
+ return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+ bool *range_dirty)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* already zeroed? we're done. */
- if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ /*
+ * We must zero subranges of unwritten mappings that might be dirty in
+ * pagecache from previous writes. We only know whether the entire range
+ * was clean or not, however, and dirty folios may have been written
+ * back or reclaimed at any point after mapping lookup.
+ *
+ * The easiest way to deal with this is to flush pagecache to trigger
+ * any pending unwritten conversions and then grab the updated extents
+ * from the fs. The flush may change the current mapping, so mark it
+ * stale for the iterator to remap it for the next pass to handle
+ * properly.
+ *
+ * Note that holes are treated the same as unwritten because zero range
+ * is (ab)used for partial folio zeroing in some cases. Hole backed
+ * post-eof ranges can be dirtied via mapped write and the flush
+ * triggers writeback time post-eof zeroing.
+ */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+ if (*range_dirty) {
+ *range_dirty = false;
+ return iomap_zero_iter_flush_and_stale(iter);
+ }
+ /* range is clean and already zeroed, nothing to do */
return length;
+ }
do {
struct folio *folio;
@@ -1450,9 +1479,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.flags = IOMAP_ZERO,
};
int ret;
+ bool range_dirty;
+
+ /*
+ * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+ * pagecache must be flushed to ensure stale data from previous
+ * buffered writes is not exposed. A flush is only required for certain
+ * types of mappings, but checking pagecache after mapping lookup is
+ * racy with writeback and reclaim.
+ *
+ * Therefore, check the entire range first and pass along whether any
+ * part of it is dirty. If so and an underlying mapping warrants it,
+ * flush the cache at that point. This trades off the occasional false
+ * positive (and spurious flush, if the dirty data and mapping don't
+ * happen to overlap) for simplicity in handling a relatively uncommon
+ * situation.
+ */
+ range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+ pos, pos + len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
@@ -2007,10 +2054,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-static int __init iomap_init(void)
+static int __init iomap_buffered_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
-fs_initcall(iomap_init);
+fs_initcall(iomap_buffered_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f3b43d223a46..f637aa0706a3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -27,6 +27,13 @@
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
+/*
+ * Used for sub block zeroing in iomap_dio_zero()
+ */
+#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
+#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
+static struct page *zero_page;
+
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
@@ -232,13 +239,20 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
-static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
+static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
- struct page *page = ZERO_PAGE(0);
struct bio *bio;
+ if (!len)
+ return 0;
+ /*
+ * Max block size supported is 64k
+ */
+ if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+ return -EINVAL;
+
bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
@@ -246,8 +260,9 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- __bio_add_page(bio, page, len, 0);
+ __bio_add_page(bio, zero_page, len, 0);
iomap_dio_submit_bio(iter, dio, bio, pos);
+ return 0;
}
/*
@@ -356,8 +371,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
- if (pad)
- iomap_dio_zero(iter, dio, pos - pad, pad);
+
+ ret = iomap_dio_zero(iter, dio, pos - pad, pad);
+ if (ret)
+ goto out;
}
/*
@@ -431,7 +448,8 @@ zero_tail:
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
if (pad)
- iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
+ ret = iomap_dio_zero(iter, dio, pos,
+ fs_block_size - pad);
}
out:
/* Undo iter limitation to current extent */
@@ -753,3 +771,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static int __init iomap_dio_init(void)
+{
+ zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ IOMAP_ZERO_PAGE_ORDER);
+
+ if (!zero_page)
+ return -ENOMEM;
+
+ return 0;
+}
+fs_initcall(iomap_dio_init);
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index ee9660e9671c..7755e587f778 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -44,7 +44,7 @@ struct RR_PN_s {
struct SL_component {
__u8 flags;
__u8 len;
- __u8 text[];
+ __u8 text[] __counted_by(len);
} __attribute__ ((packed));
struct RR_SL_s {
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 951f78634adf..b3971e91e8eb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -79,17 +79,23 @@ __releases(&journal->j_state_lock)
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
+ bool has_transaction = false;
- if (journal->j_committing_transaction)
+ if (journal->j_committing_transaction) {
tid = journal->j_committing_transaction->t_tid;
+ has_transaction = true;
+ }
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
if (chkpt) {
jbd2_log_do_checkpoint(journal);
- } else if (jbd2_cleanup_journal_tail(journal) == 0) {
- /* We were able to recover space; yay! */
+ } else if (jbd2_cleanup_journal_tail(journal) <= 0) {
+ /*
+ * We were able to recover space or the
+ * journal was aborted due to an error.
+ */
;
- } else if (tid) {
+ } else if (has_transaction) {
/*
* jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED
@@ -407,6 +413,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
tid_t tid = 0;
unsigned long nr_freed = 0;
unsigned long freed;
+ bool first_set = false;
again:
spin_lock(&journal->j_list_lock);
@@ -426,8 +433,10 @@ again:
else
transaction = journal->j_checkpoint_transactions;
- if (!first_tid)
+ if (!first_set) {
first_tid = transaction->t_tid;
+ first_set = true;
+ }
last_transaction = journal->j_checkpoint_transactions->t_cpprev;
next_transaction = transaction;
last_tid = last_transaction->t_tid;
@@ -457,7 +466,7 @@ again:
spin_unlock(&journal->j_list_lock);
cond_resched();
- if (*nr_to_scan && next_tid)
+ if (*nr_to_scan && journal->j_shrink_transaction)
goto again;
out:
trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1ebf2393bfb7..97f487c3d8fc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -281,6 +281,16 @@ static void journal_kill_thread(journal_t *journal)
write_unlock(&journal->j_state_lock);
}
+static inline bool jbd2_data_needs_escaping(char *data)
+{
+ return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER);
+}
+
+static inline void jbd2_data_do_escape(char *data)
+{
+ *((unsigned int *)data) = 0;
+}
+
/*
* jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
*
@@ -318,9 +328,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct buffer_head **bh_out,
sector_t blocknr)
{
- int done_copy_out = 0;
int do_escape = 0;
- char *mapped_data;
struct buffer_head *new_bh;
struct folio *new_folio;
unsigned int new_offset;
@@ -349,37 +357,33 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
* we use that version of the data for the commit.
*/
if (jh_in->b_frozen_data) {
- done_copy_out = 1;
new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
+ do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data);
+ if (do_escape)
+ jbd2_data_do_escape(jh_in->b_frozen_data);
} else {
+ char *tmp;
+ char *mapped_data;
+
new_folio = bh_in->b_folio;
new_offset = offset_in_folio(new_folio, bh_in->b_data);
- }
-
- mapped_data = kmap_local_folio(new_folio, new_offset);
- /*
- * Fire data frozen trigger if data already wasn't frozen. Do this
- * before checking for escaping, as the trigger may modify the magic
- * offset. If a copy-out happens afterwards, it will have the correct
- * data in the buffer.
- */
- if (!done_copy_out)
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ /*
+ * Fire data frozen trigger if data already wasn't frozen. Do
+ * this before checking for escaping, as the trigger may modify
+ * the magic offset. If a copy-out happens afterwards, it will
+ * have the correct data in the buffer.
+ */
jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
-
- /*
- * Check for escaping
- */
- if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
- do_escape = 1;
- kunmap_local(mapped_data);
-
- /*
- * Do we need to do a data copy?
- */
- if (do_escape && !done_copy_out) {
- char *tmp;
+ do_escape = jbd2_data_needs_escaping(mapped_data);
+ kunmap_local(mapped_data);
+ /*
+ * Do we need to do a data copy?
+ */
+ if (!do_escape)
+ goto escape_done;
spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
@@ -406,18 +410,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
copy_done:
new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
- done_copy_out = 1;
+ jbd2_data_do_escape(jh_in->b_frozen_data);
}
- /*
- * Did we need to do an escaping? Now we've done all the
- * copying, we can finally do so.
- * b_frozen_data is from jbd2_alloc() which always provides an
- * address from the direct kernels mapping.
- */
- if (do_escape)
- *((unsigned int *)jh_in->b_frozen_data) = 0;
-
+escape_done:
folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
@@ -710,7 +706,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
return -EINVAL;
write_lock(&journal->j_state_lock);
- if (tid <= journal->j_commit_sequence) {
+ if (tid_geq(journal->j_commit_sequence, tid)) {
write_unlock(&journal->j_state_lock);
return -EALREADY;
}
@@ -740,9 +736,9 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
*/
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
- jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid);
+ jbd2_journal_unlock_updates(journal);
write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback)
@@ -841,17 +837,12 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
*bh_out = NULL;
- if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) {
- fc_off = journal->j_fc_off;
- blocknr = journal->j_fc_first + fc_off;
- journal->j_fc_off++;
- } else {
- ret = -EINVAL;
- }
-
- if (ret)
- return ret;
+ if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last)
+ return -EINVAL;
+ fc_off = journal->j_fc_off;
+ blocknr = journal->j_fc_first + fc_off;
+ journal->j_fc_off++;
ret = jbd2_journal_bmap(journal, blocknr, &pblock);
if (ret)
return ret;
@@ -860,7 +851,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
if (!bh)
return -ENOMEM;
-
journal->j_fc_wbuf[fc_off] = bh;
*bh_out = bh;
@@ -903,7 +893,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
}
EXPORT_SYMBOL(jbd2_fc_wait_bufs);
-int jbd2_fc_release_bufs(journal_t *journal)
+void jbd2_fc_release_bufs(journal_t *journal)
{
struct buffer_head *bh;
int i, j_fc_off;
@@ -917,8 +907,6 @@ int jbd2_fc_release_bufs(journal_t *journal)
put_bh(bh);
journal->j_fc_wbuf[i] = NULL;
}
-
- return 0;
}
EXPORT_SYMBOL(jbd2_fc_release_bufs);
@@ -1944,7 +1932,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
if (had_fast_commit)
jbd2_set_feature_fast_commit(journal);
- /* Log is no longer empty */
+ /* Log is empty */
write_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
@@ -2866,8 +2854,7 @@ static struct journal_head *journal_alloc_journal_head(void)
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
- if (ret)
- spin_lock_init(&ret->b_state_lock);
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index c429c42a6867..9ff37ae650ea 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -178,10 +178,10 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
struct fd f = fdget(fd);
ssize_t ret = -EBADF;
- if (!f.file || !(f.file->f_mode & FMODE_READ))
+ if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
goto out;
- ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
+ ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
out:
fdput(f);
return ret;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index c11516801784..5e6877c37f73 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -440,7 +440,7 @@ nlm_bind_host(struct nlm_host *host)
if ((clnt = host->h_rpcclnt) != NULL) {
nlm_rebind_host(host);
} else {
- unsigned long increment = nlmsvc_timeout;
+ unsigned long increment = nlm_timeout * HZ;
struct rpc_timeout timeparms = {
.to_initval = increment,
.to_increment = increment,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ab8042a5b895..4ec22c2f2ea3 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,7 +53,6 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
static struct svc_serv *nlmsvc_serv;
-unsigned long nlmsvc_timeout;
static void nlmsvc_request_retry(struct timer_list *tl)
{
@@ -68,7 +67,7 @@ unsigned int lockd_net_id;
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
*/
static unsigned long nlm_grace_period;
-static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
+unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
static int nlm_udpport, nlm_tcpport;
/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
@@ -125,6 +124,8 @@ lockd(void *vrqstp)
struct net *net = &init_net;
struct lockd_net *ln = net_generic(net, lockd_net_id);
+ svc_thread_init_status(rqstp, 0);
+
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -333,10 +334,6 @@ static int lockd_get(void)
printk(KERN_WARNING
"lockd_up: no pid, %d users??\n", nlmsvc_users);
- if (!nlm_timeout)
- nlm_timeout = LOCKD_DFLT_TIMEO;
- nlmsvc_timeout = nlm_timeout * HZ;
-
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
diff --git a/fs/locks.c b/fs/locks.c
index b51b1c395ce6..204847628f3e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2157,15 +2157,15 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
error = -EBADF;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return error;
- if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
+ if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
goto out_putf;
- flock_make_lock(f.file, &fl, type);
+ flock_make_lock(fd_file(f), &fl, type);
- error = security_file_lock(f.file, fl.c.flc_type);
+ error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
goto out_putf;
@@ -2173,12 +2173,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (can_sleep)
fl.c.flc_flags |= FL_SLEEP;
- if (f.file->f_op->flock)
- error = f.file->f_op->flock(f.file,
+ if (fd_file(f)->f_op->flock)
+ error = fd_file(f)->f_op->flock(fd_file(f),
(can_sleep) ? F_SETLKW : F_SETLK,
&fl);
else
- error = locks_lock_file_wait(f.file, &fl);
+ error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
out_putf:
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 79491663dbc0..7b1df8cc2821 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -32,6 +32,15 @@ struct mnt_idmap nop_mnt_idmap = {
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);
+/*
+ * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range.
+ * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID.
+ */
+struct mnt_idmap invalid_mnt_idmap = {
+ .count = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL_GPL(invalid_mnt_idmap);
+
/**
* initial_idmapping - check whether this is the initial mapping
* @ns: idmapping to check
@@ -75,6 +84,8 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
if (idmap == &nop_mnt_idmap)
return VFSUIDT_INIT(kuid);
+ if (idmap == &invalid_mnt_idmap)
+ return INVALID_VFSUID;
if (initial_idmapping(fs_userns))
uid = __kuid_val(kuid);
else
@@ -112,6 +123,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
if (idmap == &nop_mnt_idmap)
return VFSGIDT_INIT(kgid);
+ if (idmap == &invalid_mnt_idmap)
+ return INVALID_VFSGID;
if (initial_idmapping(fs_userns))
gid = __kgid_val(kgid);
else
@@ -140,6 +153,8 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
if (idmap == &nop_mnt_idmap)
return AS_KUIDT(vfsuid);
+ if (idmap == &invalid_mnt_idmap)
+ return INVALID_UID;
uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
if (uid == (uid_t)-1)
return INVALID_UID;
@@ -167,6 +182,8 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
if (idmap == &nop_mnt_idmap)
return AS_KGIDT(vfsgid);
+ if (idmap == &invalid_mnt_idmap)
+ return INVALID_GID;
gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
if (gid == (gid_t)-1)
return INVALID_GID;
@@ -296,7 +313,7 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
*/
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
{
- if (idmap != &nop_mnt_idmap)
+ if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap)
refcount_inc(&idmap->count);
return idmap;
@@ -312,7 +329,8 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
*/
void mnt_idmap_put(struct mnt_idmap *idmap)
{
- if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+ if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap &&
+ refcount_dec_and_test(&idmap->count))
free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/fs/namei.c b/fs/namei.c
index 891b169e38c9..4a4a22a08ac2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2506,25 +2506,25 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
- if (f.file->f_cred != current_cred() &&
- !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+ if (fd_file(f)->f_cred != current_cred() &&
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
fdput(f);
return ERR_PTR(-ENOENT);
}
}
- dentry = f.file->f_path.dentry;
+ dentry = fd_file(f)->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
- nd->path = f.file->f_path;
+ nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
diff --git a/fs/namespace.c b/fs/namespace.c
index e71e4564987b..93c377816d75 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4134,14 +4134,14 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
}
f = fdget(fs_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
ret = -EINVAL;
- if (f.file->f_op != &fscontext_fops)
+ if (fd_file(f)->f_op != &fscontext_fops)
goto err_fsfd;
- fc = f.file->private_data;
+ fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
@@ -4471,6 +4471,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
return -EINVAL;
+ /* The filesystem has turned off idmapped mounts. */
+ if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
+ return -EINVAL;
+
/* We're not controlling the superblock. */
if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
return -EPERM;
@@ -4684,15 +4688,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
return -EINVAL;
f = fdget(attr->userns_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!proc_ns_file(f.file)) {
+ if (!proc_ns_file(fd_file(f))) {
err = -EINVAL;
goto out_fput;
}
- ns = get_proc_ns(file_inode(f.file));
+ ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops->type != CLONE_NEWUSER) {
err = -EINVAL;
goto out_fput;
@@ -5292,13 +5296,13 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
struct ns_common *ns;
CLASS(fd, f)(kreq->spare);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (!proc_ns_file(f.file))
+ if (!proc_ns_file(fd_file(f)))
return ERR_PTR(-EINVAL);
- ns = get_proc_ns(file_inode(f.file));
+ ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops->type != CLONE_NEWNS)
return ERR_PTR(-EINVAL);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index d7eae597e54d..b3910dfcb56d 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -552,6 +552,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
netfs_set_group(folio, netfs_group);
file_update_time(file);
+ set_bit(NETFS_ICTX_MODIFIED_ATTR, &ictx->flags);
if (ictx->ops->post_modify)
ictx->ops->post_modify(inode);
ret = VM_FAULT_LOCKED;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index c9f0ed24cb7b..c562aec3b483 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -58,6 +58,7 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
+struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq);
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
bool needs_put);
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 0ad0982ce0e2..63280791de3b 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -9,34 +9,66 @@
#include "internal.h"
/*
- * Append a folio to the rolling queue.
+ * Make sure there's space in the rolling queue.
*/
-int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
- bool needs_put)
+struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq)
{
- struct folio_queue *tail = rreq->buffer_tail;
- unsigned int slot, order = folio_order(folio);
+ struct folio_queue *tail = rreq->buffer_tail, *prev;
+ unsigned int prev_nr_slots = 0;
if (WARN_ON_ONCE(!rreq->buffer && tail) ||
WARN_ON_ONCE(rreq->buffer && !tail))
- return -EIO;
-
- if (!tail || folioq_full(tail)) {
- tail = kmalloc(sizeof(*tail), GFP_NOFS);
- if (!tail)
- return -ENOMEM;
- netfs_stat(&netfs_n_folioq);
- folioq_init(tail);
- tail->prev = rreq->buffer_tail;
- if (tail->prev)
- tail->prev->next = tail;
- rreq->buffer_tail = tail;
- if (!rreq->buffer) {
- rreq->buffer = tail;
- iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
+ return ERR_PTR(-EIO);
+
+ prev = tail;
+ if (prev) {
+ if (!folioq_full(tail))
+ return tail;
+ prev_nr_slots = folioq_nr_slots(tail);
+ }
+
+ tail = kmalloc(sizeof(*tail), GFP_NOFS);
+ if (!tail)
+ return ERR_PTR(-ENOMEM);
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(tail);
+ tail->prev = prev;
+ if (prev)
+ /* [!] NOTE: After we set prev->next, the consumer is entirely
+ * at liberty to delete prev.
+ */
+ WRITE_ONCE(prev->next, tail);
+
+ rreq->buffer_tail = tail;
+ if (!rreq->buffer) {
+ rreq->buffer = tail;
+ iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
+ } else {
+ /* Make sure we don't leave the master iterator pointing to a
+ * block that might get immediately consumed.
+ */
+ if (rreq->io_iter.folioq == prev &&
+ rreq->io_iter.folioq_slot == prev_nr_slots) {
+ rreq->io_iter.folioq = tail;
+ rreq->io_iter.folioq_slot = 0;
}
- rreq->buffer_tail_slot = 0;
}
+ rreq->buffer_tail_slot = 0;
+ return tail;
+}
+
+/*
+ * Append a folio to the rolling queue.
+ */
+int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
+ bool needs_put)
+{
+ struct folio_queue *tail;
+ unsigned int slot, order = folio_order(folio);
+
+ tail = netfs_buffer_make_space(rreq);
+ if (IS_ERR(tail))
+ return PTR_ERR(tail);
rreq->io_iter.count += PAGE_SIZE << order;
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 04e66d587f77..6293f547e4c3 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -153,12 +153,22 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
loff_t start)
{
struct netfs_io_subrequest *subreq;
+ struct iov_iter *wreq_iter = &wreq->io_iter;
+
+ /* Make sure we don't point the iterator at a used-up folio_queue
+ * struct being used as a placeholder to prevent the queue from
+ * collapsing. In such a case, extend the queue.
+ */
+ if (iov_iter_is_folioq(wreq_iter) &&
+ wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) {
+ netfs_buffer_make_space(wreq);
+ }
subreq = netfs_alloc_subrequest(wreq);
subreq->source = stream->source;
subreq->start = start;
subreq->stream_nr = stream->stream_nr;
- subreq->io_iter = wreq->io_iter;
+ subreq->io_iter = *wreq_iter;
_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
@@ -307,6 +317,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
struct netfs_io_stream *stream;
struct netfs_group *fgroup; /* TODO: Use this with ceph */
struct netfs_folio *finfo;
+ size_t iter_off = 0;
size_t fsize = folio_size(folio), flen = fsize, foff = 0;
loff_t fpos = folio_pos(folio), i_size;
bool to_eof = false, streamw = false;
@@ -462,7 +473,12 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (choose_s < 0)
break;
stream = &wreq->io_streams[choose_s];
- wreq->io_iter.iov_offset = stream->submit_off;
+
+ /* Advance the iterator(s). */
+ if (stream->submit_off > iter_off) {
+ iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off);
+ iter_off = stream->submit_off;
+ }
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_extendable_to = fsize - stream->submit_off;
@@ -477,8 +493,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
debug = true;
}
- wreq->io_iter.iov_offset = 0;
- iov_iter_advance(&wreq->io_iter, fsize);
+ if (fsize > iter_off)
+ iov_iter_advance(&wreq->io_iter, fsize - iter_off);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug)
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 57249f040dfc..0eb20012792f 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -4,6 +4,7 @@ config NFS_FS
depends on INET && FILE_LOCKING && MULTIUSER
select LOCKD
select SUNRPC
+ select NFS_COMMON
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
Choose Y here if you want to access files residing on other
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 5f6db37f461e..9fb2f2cac87e 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -13,6 +13,7 @@ nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
nfs-$(CONFIG_NFS_FSCACHE) += fscache.o
+nfs-$(CONFIG_NFS_LOCALIO) += localio.o
obj-$(CONFIG_NFS_V2) += nfsv2.o
nfsv2-y := nfs2super.o proc.o nfs2xdr.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 8adfcd4c8c1a..6cf92498a5ac 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -76,6 +76,8 @@ nfs4_callback_svc(void *vrqstp)
{
struct svc_rqst *rqstp = vrqstp;
+ svc_thread_init_status(rqstp, 0);
+
set_freezable();
while (!svc_thread_should_stop(rqstp))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8286edd6062d..a1d21c4be0ac 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -178,6 +178,14 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
clp->cl_net = get_net(cl_init->net);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ seqlock_init(&clp->cl_boot_lock);
+ ktime_get_real_ts64(&clp->cl_nfssvc_boot);
+ clp->cl_uuid.net = NULL;
+ clp->cl_uuid.dom = NULL;
+ spin_lock_init(&clp->cl_localio_lock);
+#endif /* CONFIG_NFS_LOCALIO */
+
clp->cl_principal = "*";
clp->cl_xprtsec = cl_init->xprtsec;
return clp;
@@ -233,6 +241,8 @@ static void pnfs_init_server(struct nfs_server *server)
*/
void nfs_free_client(struct nfs_client *clp)
{
+ nfs_local_disable(clp);
+
/* -EIO all pending I/O */
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
@@ -424,7 +434,10 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
list_add_tail(&new->cl_share_link,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
- return rpc_ops->init_client(new, cl_init);
+ new = rpc_ops->init_client(new, cl_init);
+ if (!IS_ERR(new))
+ nfs_local_probe(new);
+ return new;
}
spin_unlock(&nn->nfs_client_lock);
@@ -997,8 +1010,8 @@ struct nfs_server *nfs_alloc_server(void)
init_waitqueue_head(&server->write_congestion_wait);
atomic_long_set(&server->writeback, 0);
- ida_init(&server->openowner_id);
- ida_init(&server->lockowner_id);
+ atomic64_set(&server->owner_ctr, 0);
+
pnfs_init_server(server);
rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
@@ -1037,8 +1050,6 @@ void nfs_free_server(struct nfs_server *server)
}
ida_free(&s_sysfs_ids, server->s_sysfs_id);
- ida_destroy(&server->lockowner_id);
- ida_destroy(&server->openowner_id);
put_cred(server->cred);
nfs_release_automount_timer();
call_rcu(&server->rcu, delayed_free);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4cb97ef41350..492cffd9d3d8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -151,7 +151,7 @@ struct nfs_cache_array {
unsigned char folio_full : 1,
folio_is_eof : 1,
cookies_are_ordered : 1;
- struct nfs_cache_array_entry array[];
+ struct nfs_cache_array_entry array[] __counted_by(size);
};
struct nfs_readdir_descriptor {
@@ -328,7 +328,8 @@ static int nfs_readdir_folio_array_append(struct folio *folio,
goto out;
}
- cache_entry = &array->array[array->size];
+ array->size++;
+ cache_entry = &array->array[array->size - 1];
cache_entry->cookie = array->last_cookie;
cache_entry->ino = entry->ino;
cache_entry->d_type = entry->d_type;
@@ -337,7 +338,6 @@ static int nfs_readdir_folio_array_append(struct folio *folio,
array->last_cookie = entry->cookie;
if (array->last_cookie <= cache_entry->cookie)
array->cookies_are_ordered = 0;
- array->size++;
if (entry->eof != 0)
nfs_readdir_array_set_eof(array);
out:
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index b6e9aeaf4ce2..d39a1f58e18d 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -488,7 +488,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
/* Perform an asynchronous read to ds */
nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
- 0, RPC_TASK_SOFTCONN);
+ 0, RPC_TASK_SOFTCONN, NULL);
return PNFS_ATTEMPTED;
}
@@ -530,7 +530,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
/* Perform an asynchronous write */
nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
- sync, RPC_TASK_SOFTCONN);
+ sync, RPC_TASK_SOFTCONN, NULL);
return PNFS_ATTEMPTED;
}
@@ -1011,7 +1011,7 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
data->args.fh = fh;
return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
&filelayout_commit_call_ops, how,
- RPC_TASK_SOFTCONN);
+ RPC_TASK_SOFTCONN, NULL);
out_err:
pnfs_generic_prepare_to_resend_writes(data);
pnfs_generic_commit_release(data);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 39ba9f4208aa..f78115c6c2c1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -11,6 +11,7 @@
#include <linux/nfs_mount.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/file.h>
#include <linux/sched/mm.h>
#include <linux/sunrpc/metrics.h>
@@ -162,6 +163,21 @@ decode_name(struct xdr_stream *xdr, u32 *id)
return 0;
}
+static struct nfsd_file *
+ff_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ struct nfs_fh *fh, fmode_t mode)
+{
+ if (mode & FMODE_WRITE) {
+ /*
+ * Always request read and write access since this corresponds
+ * to a rw layout.
+ */
+ mode |= FMODE_READ;
+ }
+
+ return nfs_local_open_fh(clp, cred, fh, mode);
+}
+
static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
const struct nfs4_ff_layout_mirror *m2)
{
@@ -237,7 +253,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
{
- const struct cred *cred;
+ const struct cred *cred;
ff_layout_remove_mirror(mirror);
kfree(mirror->fh_versions);
@@ -1756,6 +1772,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
@@ -1802,11 +1819,18 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->args.offset = offset;
hdr->mds_offset = offset;
+ /* Start IO accounting for local read */
+ localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh, FMODE_READ);
+ if (localio) {
+ hdr->task.tk_start = ktime_get();
+ ff_layout_read_record_layoutstats_start(&hdr->task, hdr);
+ }
+
/* Perform an asynchronous read to ds */
nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_read_call_ops_v3 :
&ff_layout_read_call_ops_v4,
- 0, RPC_TASK_SOFTCONN);
+ 0, RPC_TASK_SOFTCONN, localio);
put_cred(ds_cred);
return PNFS_ATTEMPTED;
@@ -1826,6 +1850,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
struct pnfs_layout_segment *lseg = hdr->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
loff_t offset = hdr->args.offset;
@@ -1870,11 +1895,19 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
*/
hdr->args.offset = offset;
+ /* Start IO accounting for local write */
+ localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
+ FMODE_READ|FMODE_WRITE);
+ if (localio) {
+ hdr->task.tk_start = ktime_get();
+ ff_layout_write_record_layoutstats_start(&hdr->task, hdr);
+ }
+
/* Perform an asynchronous write */
nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_write_call_ops_v3 :
&ff_layout_write_call_ops_v4,
- sync, RPC_TASK_SOFTCONN);
+ sync, RPC_TASK_SOFTCONN, localio);
put_cred(ds_cred);
return PNFS_ATTEMPTED;
@@ -1908,6 +1941,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
struct pnfs_layout_segment *lseg = data->lseg;
struct nfs4_pnfs_ds *ds;
struct rpc_clnt *ds_clnt;
+ struct nfsd_file *localio;
struct nfs4_ff_layout_mirror *mirror;
const struct cred *ds_cred;
u32 idx;
@@ -1946,10 +1980,18 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
if (fh)
data->args.fh = fh;
+ /* Start IO accounting for local commit */
+ localio = ff_local_open_fh(ds->ds_clp, ds_cred, fh,
+ FMODE_READ|FMODE_WRITE);
+ if (localio) {
+ data->task.tk_start = ktime_get();
+ ff_layout_commit_record_layoutstats_start(&data->task, data);
+ }
+
ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
- how, RPC_TASK_SOFTCONN);
+ how, RPC_TASK_SOFTCONN, localio);
put_cred(ds_cred);
return ret;
out_err:
@@ -2087,12 +2129,6 @@ static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
}
static void
-encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
-{
- WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
-}
-
-static void
ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
const nfs4_stateid *stateid,
const struct nfs42_layoutstat_devinfo *devinfo)
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e028f5a0ef5f..e58bedfb1dcc 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -395,6 +395,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
/* connect success, check rsize/wsize limit */
if (!status) {
+ /*
+ * ds_clp is put in destroy_ds().
+ * keep ds_clp even if DS is local, so that if local IO cannot
+ * proceed somehow, we can fall back to NFS whenever we want.
+ */
+ nfs_local_probe(ds->ds_clp);
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 6c9f3f6645dd..7e000d782e28 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -49,6 +49,7 @@ enum nfs_param {
Opt_bsize,
Opt_clientaddr,
Opt_cto,
+ Opt_alignwrite,
Opt_fg,
Opt_fscache,
Opt_fscache_flag,
@@ -149,6 +150,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_u32 ("bsize", Opt_bsize),
fsparam_string("clientaddr", Opt_clientaddr),
fsparam_flag_no("cto", Opt_cto),
+ fsparam_flag_no("alignwrite", Opt_alignwrite),
fsparam_flag ("fg", Opt_fg),
fsparam_flag_no("fsc", Opt_fscache_flag),
fsparam_string("fsc", Opt_fscache),
@@ -592,6 +594,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
else
ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
break;
+ case Opt_alignwrite:
+ if (result.negated)
+ ctx->flags |= NFS_MOUNT_NO_ALIGNWRITE;
+ else
+ ctx->flags &= ~NFS_MOUNT_NO_ALIGNWRITE;
+ break;
case Opt_ac:
if (result.negated)
ctx->flags |= NFS_MOUNT_NOAC;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 11ff2b2e060f..f13d25d95b85 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -62,7 +62,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
}
/*
- * get an NFS2/NFS3 root dentry from the root filehandle
+ * get a root dentry from the root filehandle
*/
int nfs_get_root(struct super_block *s, struct fs_context *fc)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b4914a11c3c2..542c7d97b235 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2461,35 +2461,54 @@ static void nfs_destroy_inodecache(void)
kmem_cache_destroy(nfs_inode_cachep);
}
+struct workqueue_struct *nfslocaliod_workqueue;
struct workqueue_struct *nfsiod_workqueue;
EXPORT_SYMBOL_GPL(nfsiod_workqueue);
/*
- * start up the nfsiod workqueue
+ * Destroy the nfsiod workqueues
*/
-static int nfsiod_start(void)
+static void nfsiod_stop(void)
{
struct workqueue_struct *wq;
- dprintk("RPC: creating workqueue nfsiod\n");
- wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
- if (wq == NULL)
- return -ENOMEM;
- nfsiod_workqueue = wq;
- return 0;
+
+ wq = nfsiod_workqueue;
+ if (wq != NULL) {
+ nfsiod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ wq = nfslocaliod_workqueue;
+ if (wq != NULL) {
+ nfslocaliod_workqueue = NULL;
+ destroy_workqueue(wq);
+ }
+#endif /* CONFIG_NFS_LOCALIO */
}
/*
- * Destroy the nfsiod workqueue
+ * Start the nfsiod workqueues
*/
-static void nfsiod_stop(void)
+static int nfsiod_start(void)
{
- struct workqueue_struct *wq;
-
- wq = nfsiod_workqueue;
- if (wq == NULL)
- return;
- nfsiod_workqueue = NULL;
- destroy_workqueue(wq);
+ dprintk("RPC: creating workqueue nfsiod\n");
+ nfsiod_workqueue = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ if (nfsiod_workqueue == NULL)
+ return -ENOMEM;
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ /*
+ * localio writes need to use a normal (non-memreclaim) workqueue.
+ * When we start getting low on space, XFS goes and calls flush_work() on
+ * a non-memreclaim work queue, which causes a priority inversion problem.
+ */
+ dprintk("RPC: creating workqueue nfslocaliod\n");
+ nfslocaliod_workqueue = alloc_workqueue("nfslocaliod", WQ_UNBOUND, 0);
+ if (unlikely(nfslocaliod_workqueue == NULL)) {
+ nfsiod_stop();
+ return -ENOMEM;
+ }
+#endif /* CONFIG_NFS_LOCALIO */
+ return 0;
}
unsigned int nfs_net_id;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5902a9beca1f..430733e3eff2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -9,6 +9,7 @@
#include <linux/crc32.h>
#include <linux/sunrpc/addr.h>
#include <linux/nfs_page.h>
+#include <linux/nfslocalio.h>
#include <linux/wait_bit.h>
#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS)
@@ -308,7 +309,8 @@ void nfs_pgio_header_free(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
- const struct rpc_call_ops *call_ops, int how, int flags);
+ const struct rpc_call_ops *call_ops, int how, int flags,
+ struct nfsd_file *localio);
void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
@@ -438,6 +440,7 @@ int nfs_check_flags(int);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
+extern struct workqueue_struct *nfslocaliod_workqueue;
extern struct inode *nfs_alloc_inode(struct super_block *sb);
extern void nfs_free_inode(struct inode *);
extern int nfs_write_inode(struct inode *, struct writeback_control *);
@@ -449,6 +452,51 @@ extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags);
extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+/* localio.c */
+extern void nfs_local_disable(struct nfs_client *);
+extern void nfs_local_probe(struct nfs_client *);
+extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
+ const struct cred *,
+ struct nfs_fh *,
+ const fmode_t);
+extern int nfs_local_doio(struct nfs_client *,
+ struct nfsd_file *,
+ struct nfs_pgio_header *,
+ const struct rpc_call_ops *);
+extern int nfs_local_commit(struct nfsd_file *,
+ struct nfs_commit_data *,
+ const struct rpc_call_ops *, int);
+extern bool nfs_server_is_local(const struct nfs_client *clp);
+
+#else /* CONFIG_NFS_LOCALIO */
+static inline void nfs_local_disable(struct nfs_client *clp) {}
+static inline void nfs_local_probe(struct nfs_client *clp) {}
+static inline struct nfsd_file *
+nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ struct nfs_fh *fh, const fmode_t mode)
+{
+ return NULL;
+}
+static inline int nfs_local_doio(struct nfs_client *clp,
+ struct nfsd_file *localio,
+ struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops)
+{
+ return -EINVAL;
+}
+static inline int nfs_local_commit(struct nfsd_file *localio,
+ struct nfs_commit_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ return -EINVAL;
+}
+static inline bool nfs_server_is_local(const struct nfs_client *clp)
+{
+ return false;
+}
+#endif /* CONFIG_NFS_LOCALIO */
+
/* super.c */
extern const struct super_operations nfs_sops;
bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
@@ -505,7 +553,6 @@ extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
struct nfs_open_context *ctx,
struct folio *folio);
extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
-extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
/* super.c */
@@ -528,7 +575,8 @@ extern int nfs_initiate_commit(struct rpc_clnt *clnt,
struct nfs_commit_data *data,
const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
- int how, int flags);
+ int how, int flags,
+ struct nfsd_file *localio);
extern void nfs_init_commit(struct nfs_commit_data *data,
struct list_head *head,
struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
new file mode 100644
index 000000000000..c29cdf51c458
--- /dev/null
+++ b/fs/nfs/localio.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NFS client support for local clients to bypass network stack
+ *
+ * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
+ * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ * Copyright (C) 2024 NeilBrown <neilb@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/vfs.h>
+#include <linux/file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/nfs_common.h>
+#include <linux/nfslocalio.h>
+#include <linux/module.h>
+#include <linux/bvec.h>
+
+#include <linux/nfs.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+
+#include "internal.h"
+#include "pnfs.h"
+#include "nfstrace.h"
+
+#define NFSDBG_FACILITY NFSDBG_VFS
+
+struct nfs_local_kiocb {
+ struct kiocb kiocb;
+ struct bio_vec *bvec;
+ struct nfs_pgio_header *hdr;
+ struct work_struct work;
+ struct nfsd_file *localio;
+};
+
+struct nfs_local_fsync_ctx {
+ struct nfsd_file *localio;
+ struct nfs_commit_data *data;
+ struct work_struct work;
+ struct kref kref;
+ struct completion *done;
+};
+static void nfs_local_fsync_work(struct work_struct *work);
+
+static bool localio_enabled __read_mostly = true;
+module_param(localio_enabled, bool, 0644);
+
+static inline bool nfs_client_is_local(const struct nfs_client *clp)
+{
+ return !!test_bit(NFS_CS_LOCAL_IO, &clp->cl_flags);
+}
+
+bool nfs_server_is_local(const struct nfs_client *clp)
+{
+ return nfs_client_is_local(clp) && localio_enabled;
+}
+EXPORT_SYMBOL_GPL(nfs_server_is_local);
+
+/*
+ * UUID_IS_LOCAL XDR functions
+ */
+
+static void localio_xdr_enc_uuidargs(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const u8 *uuid = data;
+
+ encode_opaque_fixed(xdr, uuid, UUID_SIZE);
+}
+
+static int localio_xdr_dec_uuidres(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ void *result)
+{
+ /* void return */
+ return 0;
+}
+
+static const struct rpc_procinfo nfs_localio_procedures[] = {
+ [LOCALIOPROC_UUID_IS_LOCAL] = {
+ .p_proc = LOCALIOPROC_UUID_IS_LOCAL,
+ .p_encode = localio_xdr_enc_uuidargs,
+ .p_decode = localio_xdr_dec_uuidres,
+ .p_arglen = XDR_QUADLEN(UUID_SIZE),
+ .p_replen = 0,
+ .p_statidx = LOCALIOPROC_UUID_IS_LOCAL,
+ .p_name = "UUID_IS_LOCAL",
+ },
+};
+
+static unsigned int nfs_localio_counts[ARRAY_SIZE(nfs_localio_procedures)];
+static const struct rpc_version nfslocalio_version1 = {
+ .number = 1,
+ .nrprocs = ARRAY_SIZE(nfs_localio_procedures),
+ .procs = nfs_localio_procedures,
+ .counts = nfs_localio_counts,
+};
+
+static const struct rpc_version *nfslocalio_version[] = {
+ [1] = &nfslocalio_version1,
+};
+
+extern const struct rpc_program nfslocalio_program;
+static struct rpc_stat nfslocalio_rpcstat = { &nfslocalio_program };
+
+const struct rpc_program nfslocalio_program = {
+ .name = "nfslocalio",
+ .number = NFS_LOCALIO_PROGRAM,
+ .nrvers = ARRAY_SIZE(nfslocalio_version),
+ .version = nfslocalio_version,
+ .stats = &nfslocalio_rpcstat,
+};
+
+/*
+ * nfs_local_enable - enable local i/o for an nfs_client
+ */
+static void nfs_local_enable(struct nfs_client *clp)
+{
+ spin_lock(&clp->cl_localio_lock);
+ set_bit(NFS_CS_LOCAL_IO, &clp->cl_flags);
+ trace_nfs_local_enable(clp);
+ spin_unlock(&clp->cl_localio_lock);
+}
+
+/*
+ * nfs_local_disable - disable local i/o for an nfs_client
+ */
+void nfs_local_disable(struct nfs_client *clp)
+{
+ spin_lock(&clp->cl_localio_lock);
+ if (test_and_clear_bit(NFS_CS_LOCAL_IO, &clp->cl_flags)) {
+ trace_nfs_local_disable(clp);
+ nfs_uuid_invalidate_one_client(&clp->cl_uuid);
+ }
+ spin_unlock(&clp->cl_localio_lock);
+}
+
+/*
+ * nfs_init_localioclient - Initialise an NFS localio client connection
+ */
+static struct rpc_clnt *nfs_init_localioclient(struct nfs_client *clp)
+{
+ struct rpc_clnt *rpcclient_localio;
+
+ rpcclient_localio = rpc_bind_new_program(clp->cl_rpcclient,
+ &nfslocalio_program, 1);
+
+ dprintk_rcu("%s: server (%s) %s NFS LOCALIO.\n",
+ __func__, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+ (IS_ERR(rpcclient_localio) ? "does not support" : "supports"));
+
+ return rpcclient_localio;
+}
+
+static bool nfs_server_uuid_is_local(struct nfs_client *clp)
+{
+ u8 uuid[UUID_SIZE];
+ struct rpc_message msg = {
+ .rpc_argp = &uuid,
+ };
+ struct rpc_clnt *rpcclient_localio;
+ int status;
+
+ rpcclient_localio = nfs_init_localioclient(clp);
+ if (IS_ERR(rpcclient_localio))
+ return false;
+
+ export_uuid(uuid, &clp->cl_uuid.uuid);
+
+ msg.rpc_proc = &nfs_localio_procedures[LOCALIOPROC_UUID_IS_LOCAL];
+ status = rpc_call_sync(rpcclient_localio, &msg, 0);
+ dprintk("%s: NFS reply UUID_IS_LOCAL: status=%d\n",
+ __func__, status);
+ rpc_shutdown_client(rpcclient_localio);
+
+ /* Server is only local if it initialized required struct members */
+ if (status || !clp->cl_uuid.net || !clp->cl_uuid.dom)
+ return false;
+
+ return true;
+}
+
+/*
+ * nfs_local_probe - probe local i/o support for an nfs_server and nfs_client
+ * - called after alloc_client and init_client (so cl_rpcclient exists)
+ * - this function is idempotent, it can be called for old or new clients
+ */
+void nfs_local_probe(struct nfs_client *clp)
+{
+ /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
+ if (!localio_enabled ||
+ clp->cl_rpcclient->cl_auth->au_flavor != RPC_AUTH_UNIX) {
+ nfs_local_disable(clp);
+ return;
+ }
+
+ if (nfs_client_is_local(clp)) {
+ /* If already enabled, disable and re-enable */
+ nfs_local_disable(clp);
+ }
+
+ nfs_uuid_begin(&clp->cl_uuid);
+ if (nfs_server_uuid_is_local(clp))
+ nfs_local_enable(clp);
+ nfs_uuid_end(&clp->cl_uuid);
+}
+EXPORT_SYMBOL_GPL(nfs_local_probe);
+
+/*
+ * nfs_local_open_fh - open a local filehandle in terms of nfsd_file
+ *
+ * Returns a pointer to a struct nfsd_file or NULL
+ */
+struct nfsd_file *
+nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
+ struct nfs_fh *fh, const fmode_t mode)
+{
+ struct nfsd_file *localio;
+ int status;
+
+ if (!nfs_server_is_local(clp))
+ return NULL;
+ if (mode & ~(FMODE_READ | FMODE_WRITE))
+ return NULL;
+
+ localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
+ cred, fh, mode);
+ if (IS_ERR(localio)) {
+ status = PTR_ERR(localio);
+ trace_nfs_local_open_fh(fh, mode, status);
+ switch (status) {
+ case -ENOMEM:
+ case -ENXIO:
+ case -ENOENT:
+ /* Revalidate localio, will disable if unsupported */
+ nfs_local_probe(clp);
+ }
+ return NULL;
+ }
+ return localio;
+}
+EXPORT_SYMBOL_GPL(nfs_local_open_fh);
+
+static struct bio_vec *
+nfs_bvec_alloc_and_import_pagevec(struct page **pagevec,
+ unsigned int npages, gfp_t flags)
+{
+ struct bio_vec *bvec, *p;
+
+ bvec = kmalloc_array(npages, sizeof(*bvec), flags);
+ if (bvec != NULL) {
+ for (p = bvec; npages > 0; p++, pagevec++, npages--) {
+ p->bv_page = *pagevec;
+ p->bv_len = PAGE_SIZE;
+ p->bv_offset = 0;
+ }
+ }
+ return bvec;
+}
+
+static void
+nfs_local_iocb_free(struct nfs_local_kiocb *iocb)
+{
+ kfree(iocb->bvec);
+ kfree(iocb);
+}
+
+static struct nfs_local_kiocb *
+nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
+ struct nfsd_file *localio, gfp_t flags)
+{
+ struct nfs_local_kiocb *iocb;
+
+ iocb = kmalloc(sizeof(*iocb), flags);
+ if (iocb == NULL)
+ return NULL;
+ iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec,
+ hdr->page_array.npages, flags);
+ if (iocb->bvec == NULL) {
+ kfree(iocb);
+ return NULL;
+ }
+ init_sync_kiocb(&iocb->kiocb, nfs_to->nfsd_file_file(localio));
+ iocb->kiocb.ki_pos = hdr->args.offset;
+ iocb->localio = localio;
+ iocb->hdr = hdr;
+ iocb->kiocb.ki_flags &= ~IOCB_APPEND;
+ return iocb;
+}
+
+static void
+nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
+ iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages,
+ hdr->args.count + hdr->args.pgbase);
+ if (hdr->args.pgbase != 0)
+ iov_iter_advance(i, hdr->args.pgbase);
+}
+
+static void
+nfs_local_hdr_release(struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops)
+{
+ call_ops->rpc_call_done(&hdr->task, hdr);
+ call_ops->rpc_release(hdr);
+}
+
+static void
+nfs_local_pgio_init(struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops)
+{
+ hdr->task.tk_ops = call_ops;
+ if (!hdr->task.tk_start)
+ hdr->task.tk_start = ktime_get();
+}
+
+static void
+nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
+{
+ if (status >= 0) {
+ hdr->res.count = status;
+ hdr->res.op_status = NFS4_OK;
+ hdr->task.tk_status = 0;
+ } else {
+ hdr->res.op_status = nfs4_stat_to_errno(status);
+ hdr->task.tk_status = status;
+ }
+}
+
+static void
+nfs_local_pgio_release(struct nfs_local_kiocb *iocb)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
+ nfs_to->nfsd_file_put_local(iocb->localio);
+ nfs_local_iocb_free(iocb);
+ nfs_local_hdr_release(hdr, hdr->task.tk_ops);
+}
+
+static void
+nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ struct file *filp = iocb->kiocb.ki_filp;
+
+ nfs_local_pgio_done(hdr, status);
+
+ if (hdr->res.count != hdr->args.count ||
+ hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
+ hdr->res.eof = true;
+
+ dprintk("%s: read %ld bytes eof %d.\n", __func__,
+ status > 0 ? status : 0, hdr->res.eof);
+}
+
+static void nfs_local_call_read(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
+ const struct cred *save_cred;
+ struct iov_iter iter;
+ ssize_t status;
+
+ save_cred = override_creds(filp->f_cred);
+
+ nfs_local_iter_init(&iter, iocb, READ);
+
+ status = filp->f_op->read_iter(&iocb->kiocb, &iter);
+ WARN_ON_ONCE(status == -EIOCBQUEUED);
+
+ nfs_local_read_done(iocb, status);
+ nfs_local_pgio_release(iocb);
+
+ revert_creds(save_cred);
+}
+
+static int
+nfs_do_local_read(struct nfs_pgio_header *hdr,
+ struct nfsd_file *localio,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_kiocb *iocb;
+
+ dprintk("%s: vfs_read count=%u pos=%llu\n",
+ __func__, hdr->args.count, hdr->args.offset);
+
+ iocb = nfs_local_iocb_alloc(hdr, localio, GFP_KERNEL);
+ if (iocb == NULL)
+ return -ENOMEM;
+
+ nfs_local_pgio_init(hdr, call_ops);
+ hdr->res.eof = false;
+
+ INIT_WORK(&iocb->work, nfs_local_call_read);
+ queue_work(nfslocaliod_workqueue, &iocb->work);
+
+ return 0;
+}
+
+static void
+nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ u32 *verf = (u32 *)verifier->data;
+ int seq = 0;
+
+ do {
+ read_seqbegin_or_lock(&clp->cl_boot_lock, &seq);
+ verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec;
+ verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec;
+ } while (need_seqretry(&clp->cl_boot_lock, seq));
+ done_seqretry(&clp->cl_boot_lock, seq);
+}
+
+static void
+nfs_reset_boot_verifier(struct inode *inode)
+{
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ write_seqlock(&clp->cl_boot_lock);
+ ktime_get_real_ts64(&clp->cl_nfssvc_boot);
+ write_sequnlock(&clp->cl_boot_lock);
+}
+
+static void
+nfs_set_local_verifier(struct inode *inode,
+ struct nfs_writeverf *verf,
+ enum nfs3_stable_how how)
+{
+ nfs_copy_boot_verifier(&verf->verifier, inode);
+ verf->committed = how;
+}
+
+/* Factored out from fs/nfsd/vfs.h:fh_getattr() */
+static int __vfs_getattr(struct path *p, struct kstat *stat, int version)
+{
+ u32 request_mask = STATX_BASIC_STATS;
+
+ if (version == 4)
+ request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
+ return vfs_getattr(p, stat, request_mask, AT_STATX_SYNC_AS_STAT);
+}
+
+/* Copied from fs/nfsd/nfsfh.c:nfsd4_change_attribute() */
+static u64 __nfsd4_change_attribute(const struct kstat *stat,
+ const struct inode *inode)
+{
+ u64 chattr;
+
+ if (stat->result_mask & STATX_CHANGE_COOKIE) {
+ chattr = stat->change_cookie;
+ if (S_ISREG(inode->i_mode) &&
+ !(stat->attributes & STATX_ATTR_CHANGE_MONOTONIC)) {
+ chattr += (u64)stat->ctime.tv_sec << 30;
+ chattr += stat->ctime.tv_nsec;
+ }
+ } else {
+ chattr = time_to_chattr(&stat->ctime);
+ }
+ return chattr;
+}
+
+static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb)
+{
+ struct kstat stat;
+ struct file *filp = iocb->kiocb.ki_filp;
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ struct nfs_fattr *fattr = hdr->res.fattr;
+ int version = NFS_PROTO(hdr->inode)->version;
+
+ if (unlikely(!fattr) || __vfs_getattr(&filp->f_path, &stat, version))
+ return;
+
+ fattr->valid = (NFS_ATTR_FATTR_FILEID |
+ NFS_ATTR_FATTR_CHANGE |
+ NFS_ATTR_FATTR_SIZE |
+ NFS_ATTR_FATTR_ATIME |
+ NFS_ATTR_FATTR_MTIME |
+ NFS_ATTR_FATTR_CTIME |
+ NFS_ATTR_FATTR_SPACE_USED);
+
+ fattr->fileid = stat.ino;
+ fattr->size = stat.size;
+ fattr->atime = stat.atime;
+ fattr->mtime = stat.mtime;
+ fattr->ctime = stat.ctime;
+ if (version == 4) {
+ fattr->change_attr =
+ __nfsd4_change_attribute(&stat, file_inode(filp));
+ } else
+ fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
+ fattr->du.nfs3.used = stat.blocks << 9;
+}
+
+static void
+nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
+{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+ struct inode *inode = hdr->inode;
+
+ dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
+
+ /* Handle short writes as if they are ENOSPC */
+ if (status > 0 && status < hdr->args.count) {
+ hdr->mds_offset += status;
+ hdr->args.offset += status;
+ hdr->args.pgbase += status;
+ hdr->args.count -= status;
+ nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
+ status = -ENOSPC;
+ }
+ if (status < 0)
+ nfs_reset_boot_verifier(inode);
+ else if (nfs_should_remove_suid(inode)) {
+ /* Deal with the suid/sgid bit corner case */
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ spin_unlock(&inode->i_lock);
+ }
+ nfs_local_pgio_done(hdr, status);
+}
+
+static void nfs_local_call_write(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
+ unsigned long old_flags = current->flags;
+ const struct cred *save_cred;
+ struct iov_iter iter;
+ ssize_t status;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ save_cred = override_creds(filp->f_cred);
+
+ nfs_local_iter_init(&iter, iocb, WRITE);
+
+ file_start_write(filp);
+ status = filp->f_op->write_iter(&iocb->kiocb, &iter);
+ file_end_write(filp);
+ WARN_ON_ONCE(status == -EIOCBQUEUED);
+
+ nfs_local_write_done(iocb, status);
+ nfs_local_vfs_getattr(iocb);
+ nfs_local_pgio_release(iocb);
+
+ revert_creds(save_cred);
+ current->flags = old_flags;
+}
+
+static int
+nfs_do_local_write(struct nfs_pgio_header *hdr,
+ struct nfsd_file *localio,
+ const struct rpc_call_ops *call_ops)
+{
+ struct nfs_local_kiocb *iocb;
+
+ dprintk("%s: vfs_write count=%u pos=%llu %s\n",
+ __func__, hdr->args.count, hdr->args.offset,
+ (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable");
+
+ iocb = nfs_local_iocb_alloc(hdr, localio, GFP_NOIO);
+ if (iocb == NULL)
+ return -ENOMEM;
+
+ switch (hdr->args.stable) {
+ default:
+ break;
+ case NFS_DATA_SYNC:
+ iocb->kiocb.ki_flags |= IOCB_DSYNC;
+ break;
+ case NFS_FILE_SYNC:
+ iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
+ }
+ nfs_local_pgio_init(hdr, call_ops);
+
+ nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable);
+
+ INIT_WORK(&iocb->work, nfs_local_call_write);
+ queue_work(nfslocaliod_workqueue, &iocb->work);
+
+ return 0;
+}
+
+int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio,
+ struct nfs_pgio_header *hdr,
+ const struct rpc_call_ops *call_ops)
+{
+ int status = 0;
+ struct file *filp = nfs_to->nfsd_file_file(localio);
+
+ if (!hdr->args.count)
+ return 0;
+ /* Don't support filesystems without read_iter/write_iter */
+ if (!filp->f_op->read_iter || !filp->f_op->write_iter) {
+ nfs_local_disable(clp);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ switch (hdr->rw_mode) {
+ case FMODE_READ:
+ status = nfs_do_local_read(hdr, localio, call_ops);
+ break;
+ case FMODE_WRITE:
+ status = nfs_do_local_write(hdr, localio, call_ops);
+ break;
+ default:
+ dprintk("%s: invalid mode: %d\n", __func__,
+ hdr->rw_mode);
+ status = -EINVAL;
+ }
+out:
+ if (status != 0) {
+ nfs_to->nfsd_file_put_local(localio);
+ hdr->task.tk_status = status;
+ nfs_local_hdr_release(hdr, call_ops);
+ }
+ return status;
+}
+
+static void
+nfs_local_init_commit(struct nfs_commit_data *data,
+ const struct rpc_call_ops *call_ops)
+{
+ data->task.tk_ops = call_ops;
+}
+
+static int
+nfs_local_run_commit(struct file *filp, struct nfs_commit_data *data)
+{
+ loff_t start = data->args.offset;
+ loff_t end = LLONG_MAX;
+
+ if (data->args.count > 0) {
+ end = start + data->args.count - 1;
+ if (end < start)
+ end = LLONG_MAX;
+ }
+
+ dprintk("%s: commit %llu - %llu\n", __func__, start, end);
+ return vfs_fsync_range(filp, start, end, 0);
+}
+
+static void
+nfs_local_commit_done(struct nfs_commit_data *data, int status)
+{
+ if (status >= 0) {
+ nfs_set_local_verifier(data->inode,
+ data->res.verf,
+ NFS_FILE_SYNC);
+ data->res.op_status = NFS4_OK;
+ data->task.tk_status = 0;
+ } else {
+ nfs_reset_boot_verifier(data->inode);
+ data->res.op_status = nfs4_stat_to_errno(status);
+ data->task.tk_status = status;
+ }
+}
+
+static void
+nfs_local_release_commit_data(struct nfsd_file *localio,
+ struct nfs_commit_data *data,
+ const struct rpc_call_ops *call_ops)
+{
+ nfs_to->nfsd_file_put_local(localio);
+ call_ops->rpc_call_done(&data->task, data);
+ call_ops->rpc_release(data);
+}
+
+static struct nfs_local_fsync_ctx *
+nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data,
+ struct nfsd_file *localio, gfp_t flags)
+{
+ struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags);
+
+ if (ctx != NULL) {
+ ctx->localio = localio;
+ ctx->data = data;
+ INIT_WORK(&ctx->work, nfs_local_fsync_work);
+ kref_init(&ctx->kref);
+ ctx->done = NULL;
+ }
+ return ctx;
+}
+
+static void
+nfs_local_fsync_ctx_kref_free(struct kref *kref)
+{
+ kfree(container_of(kref, struct nfs_local_fsync_ctx, kref));
+}
+
+static void
+nfs_local_fsync_ctx_put(struct nfs_local_fsync_ctx *ctx)
+{
+ kref_put(&ctx->kref, nfs_local_fsync_ctx_kref_free);
+}
+
+static void
+nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx)
+{
+ nfs_local_release_commit_data(ctx->localio, ctx->data,
+ ctx->data->task.tk_ops);
+ nfs_local_fsync_ctx_put(ctx);
+}
+
+static void
+nfs_local_fsync_work(struct work_struct *work)
+{
+ struct nfs_local_fsync_ctx *ctx;
+ int status;
+
+ ctx = container_of(work, struct nfs_local_fsync_ctx, work);
+
+ status = nfs_local_run_commit(nfs_to->nfsd_file_file(ctx->localio),
+ ctx->data);
+ nfs_local_commit_done(ctx->data, status);
+ if (ctx->done != NULL)
+ complete(ctx->done);
+ nfs_local_fsync_ctx_free(ctx);
+}
+
+int nfs_local_commit(struct nfsd_file *localio,
+ struct nfs_commit_data *data,
+ const struct rpc_call_ops *call_ops, int how)
+{
+ struct nfs_local_fsync_ctx *ctx;
+
+ ctx = nfs_local_fsync_ctx_alloc(data, localio, GFP_KERNEL);
+ if (!ctx) {
+ nfs_local_commit_done(data, -ENOMEM);
+ nfs_local_release_commit_data(localio, data, call_ops);
+ return -ENOMEM;
+ }
+
+ nfs_local_init_commit(data, call_ops);
+ kref_get(&ctx->kref);
+ if (how & FLUSH_SYNC) {
+ DECLARE_COMPLETION_ONSTACK(done);
+ ctx->done = &done;
+ queue_work(nfsiod_workqueue, &ctx->work);
+ wait_for_completion(&done);
+ } else
+ queue_work(nfsiod_workqueue, &ctx->work);
+ nfs_local_fsync_ctx_put(ctx);
+ return 0;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c19093814296..6e75c6c2d234 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -22,14 +22,12 @@
#include <linux/nfs.h>
#include <linux/nfs2.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_common.h>
#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
-/* Mapping from NFS error code to "errno" error code. */
-#define errno_NFSERR_IO EIO
-
/*
* Declare the space requirements for NFS arguments and replies as
* number of 32bit-words
@@ -64,8 +62,6 @@
#define NFS_readdirres_sz (1+NFS_pagepad_sz)
#define NFS_statfsres_sz (1+NFS_info_sz)
-static int nfs_stat_to_errno(enum nfs_stat);
-
/*
* Encode/decode NFSv2 basic data types
*
@@ -1054,70 +1050,6 @@ out_default:
return nfs_stat_to_errno(status);
}
-
-/*
- * We need to translate between nfs status return values and
- * the local errno values which may not be the same.
- */
-static const struct {
- int stat;
- int errno;
-} nfs_errtbl[] = {
- { NFS_OK, 0 },
- { NFSERR_PERM, -EPERM },
- { NFSERR_NOENT, -ENOENT },
- { NFSERR_IO, -errno_NFSERR_IO},
- { NFSERR_NXIO, -ENXIO },
-/* { NFSERR_EAGAIN, -EAGAIN }, */
- { NFSERR_ACCES, -EACCES },
- { NFSERR_EXIST, -EEXIST },
- { NFSERR_XDEV, -EXDEV },
- { NFSERR_NODEV, -ENODEV },
- { NFSERR_NOTDIR, -ENOTDIR },
- { NFSERR_ISDIR, -EISDIR },
- { NFSERR_INVAL, -EINVAL },
- { NFSERR_FBIG, -EFBIG },
- { NFSERR_NOSPC, -ENOSPC },
- { NFSERR_ROFS, -EROFS },
- { NFSERR_MLINK, -EMLINK },
- { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
- { NFSERR_NOTEMPTY, -ENOTEMPTY },
- { NFSERR_DQUOT, -EDQUOT },
- { NFSERR_STALE, -ESTALE },
- { NFSERR_REMOTE, -EREMOTE },
-#ifdef EWFLUSH
- { NFSERR_WFLUSH, -EWFLUSH },
-#endif
- { NFSERR_BADHANDLE, -EBADHANDLE },
- { NFSERR_NOT_SYNC, -ENOTSYNC },
- { NFSERR_BAD_COOKIE, -EBADCOOKIE },
- { NFSERR_NOTSUPP, -ENOTSUPP },
- { NFSERR_TOOSMALL, -ETOOSMALL },
- { NFSERR_SERVERFAULT, -EREMOTEIO },
- { NFSERR_BADTYPE, -EBADTYPE },
- { NFSERR_JUKEBOX, -EJUKEBOX },
- { -1, -EIO }
-};
-
-/**
- * nfs_stat_to_errno - convert an NFS status code to a local errno
- * @status: NFS status code to convert
- *
- * Returns a local errno value, or -EIO if the NFS status code is
- * not recognized. This function is used jointly by NFSv2 and NFSv3.
- */
-static int nfs_stat_to_errno(enum nfs_stat status)
-{
- int i;
-
- for (i = 0; nfs_errtbl[i].stat != -1; i++) {
- if (nfs_errtbl[i].stat == (int)status)
- return nfs_errtbl[i].errno;
- }
- dprintk("NFS: Unrecognized nfs status value: %u\n", status);
- return nfs_errtbl[i].errno;
-}
-
#define PROC(proc, argtype, restype, timer) \
[NFSPROC_##proc] = { \
.p_proc = NFSPROC_##proc, \
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 60f032be805a..4ae01c10b7e2 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,14 +21,13 @@
#include <linux/nfs3.h>
#include <linux/nfs_fs.h>
#include <linux/nfsacl.h>
+#include <linux/nfs_common.h>
+
#include "nfstrace.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_XDR
-/* Mapping from NFS error code to "errno" error code. */
-#define errno_NFSERR_IO EIO
-
/*
* Declare the space requirements for NFS arguments and replies as
* number of 32bit-words
@@ -91,8 +90,6 @@
NFS3_pagepad_sz)
#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz)
-static int nfs3_stat_to_errno(enum nfs_stat);
-
/*
* Map file type to S_IFMT bits
*/
@@ -1406,7 +1403,7 @@ static int nfs3_xdr_dec_getattr3res(struct rpc_rqst *req,
out:
return error;
out_default:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1445,7 +1442,7 @@ static int nfs3_xdr_dec_setattr3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1495,7 +1492,7 @@ out_default:
error = decode_post_op_attr(xdr, result->dir_attr, userns);
if (unlikely(error))
goto out;
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1537,7 +1534,7 @@ static int nfs3_xdr_dec_access3res(struct rpc_rqst *req,
out:
return error;
out_default:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1578,7 +1575,7 @@ static int nfs3_xdr_dec_readlink3res(struct rpc_rqst *req,
out:
return error;
out_default:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1658,7 +1655,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1728,7 +1725,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1795,7 +1792,7 @@ out_default:
error = decode_wcc_data(xdr, result->dir_attr, userns);
if (unlikely(error))
goto out;
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1835,7 +1832,7 @@ static int nfs3_xdr_dec_remove3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1881,7 +1878,7 @@ static int nfs3_xdr_dec_rename3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -1926,7 +1923,7 @@ static int nfs3_xdr_dec_link3res(struct rpc_rqst *req, struct xdr_stream *xdr,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/**
@@ -2101,7 +2098,7 @@ out_default:
error = decode_post_op_attr(xdr, result->dir_attr, rpc_rqst_userns(req));
if (unlikely(error))
goto out;
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -2167,7 +2164,7 @@ static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -2243,7 +2240,7 @@ static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -2304,7 +2301,7 @@ static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
/*
@@ -2350,7 +2347,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
out:
return error;
out_status:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
#ifdef CONFIG_NFS_V3_ACL
@@ -2416,7 +2413,7 @@ static int nfs3_xdr_dec_getacl3res(struct rpc_rqst *req,
out:
return error;
out_default:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
@@ -2435,76 +2432,11 @@ static int nfs3_xdr_dec_setacl3res(struct rpc_rqst *req,
out:
return error;
out_default:
- return nfs3_stat_to_errno(status);
+ return nfs_stat_to_errno(status);
}
#endif /* CONFIG_NFS_V3_ACL */
-
-/*
- * We need to translate between nfs status return values and
- * the local errno values which may not be the same.
- */
-static const struct {
- int stat;
- int errno;
-} nfs_errtbl[] = {
- { NFS_OK, 0 },
- { NFSERR_PERM, -EPERM },
- { NFSERR_NOENT, -ENOENT },
- { NFSERR_IO, -errno_NFSERR_IO},
- { NFSERR_NXIO, -ENXIO },
-/* { NFSERR_EAGAIN, -EAGAIN }, */
- { NFSERR_ACCES, -EACCES },
- { NFSERR_EXIST, -EEXIST },
- { NFSERR_XDEV, -EXDEV },
- { NFSERR_NODEV, -ENODEV },
- { NFSERR_NOTDIR, -ENOTDIR },
- { NFSERR_ISDIR, -EISDIR },
- { NFSERR_INVAL, -EINVAL },
- { NFSERR_FBIG, -EFBIG },
- { NFSERR_NOSPC, -ENOSPC },
- { NFSERR_ROFS, -EROFS },
- { NFSERR_MLINK, -EMLINK },
- { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
- { NFSERR_NOTEMPTY, -ENOTEMPTY },
- { NFSERR_DQUOT, -EDQUOT },
- { NFSERR_STALE, -ESTALE },
- { NFSERR_REMOTE, -EREMOTE },
-#ifdef EWFLUSH
- { NFSERR_WFLUSH, -EWFLUSH },
-#endif
- { NFSERR_BADHANDLE, -EBADHANDLE },
- { NFSERR_NOT_SYNC, -ENOTSYNC },
- { NFSERR_BAD_COOKIE, -EBADCOOKIE },
- { NFSERR_NOTSUPP, -ENOTSUPP },
- { NFSERR_TOOSMALL, -ETOOSMALL },
- { NFSERR_SERVERFAULT, -EREMOTEIO },
- { NFSERR_BADTYPE, -EBADTYPE },
- { NFSERR_JUKEBOX, -EJUKEBOX },
- { -1, -EIO }
-};
-
-/**
- * nfs3_stat_to_errno - convert an NFS status code to a local errno
- * @status: NFS status code to convert
- *
- * Returns a local errno value, or -EIO if the NFS status code is
- * not recognized. This function is used jointly by NFSv2 and NFSv3.
- */
-static int nfs3_stat_to_errno(enum nfs_stat status)
-{
- int i;
-
- for (i = 0; nfs_errtbl[i].stat != -1; i++) {
- if (nfs_errtbl[i].stat == (int)status)
- return nfs_errtbl[i].errno;
- }
- dprintk("NFS: Unrecognized nfs status value: %u\n", status);
- return nfs_errtbl[i].errno;
-}
-
-
#define PROC(proc, argtype, restype, timer) \
[NFS3PROC_##proc] = { \
.p_proc = NFS3PROC_##proc, \
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c2045a2a9d0f..7d383d29a995 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -83,7 +83,7 @@ struct nfs4_minor_version_ops {
#define NFS_SEQID_CONFIRMED 1
struct nfs_seqid_counter {
ktime_t create_time;
- int owner_id;
+ u64 owner_id;
int flags;
u32 counter;
spinlock_t lock; /* Protects the list */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b8ffbe52ba15..cd2fbde2e6d7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3904,6 +3904,18 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_OPEN_ARGUMENTS - 1UL)
+#define FATTR4_WORD2_NFS42_TIME_DELEG_MASK \
+ (FATTR4_WORD2_TIME_DELEG_MODIFY|FATTR4_WORD2_TIME_DELEG_ACCESS)
+static bool nfs4_server_delegtime_capable(struct nfs4_server_caps_res *res)
+{
+ u32 share_access_want = res->open_caps.oa_share_access_want[0];
+ u32 attr_bitmask = res->attr_bitmask[2];
+
+ return (share_access_want & NFS4_SHARE_WANT_DELEG_TIMESTAMPS) &&
+ ((attr_bitmask & FATTR4_WORD2_NFS42_TIME_DELEG_MASK) ==
+ FATTR4_WORD2_NFS42_TIME_DELEG_MASK);
+}
+
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
u32 minorversion = server->nfs_client->cl_minorversion;
@@ -3982,8 +3994,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
#endif
if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS)
server->caps |= NFS_CAP_FS_LOCATIONS;
- if (res.attr_bitmask[2] & FATTR4_WORD2_TIME_DELEG_MODIFY)
- server->caps |= NFS_CAP_DELEGTIME;
if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
@@ -4011,6 +4021,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
if (res.open_caps.oa_share_access_want[0] &
NFS4_SHARE_WANT_OPEN_XOR_DELEGATION)
server->caps |= NFS_CAP_OPEN_XOR;
+ if (nfs4_server_delegtime_capable(&res))
+ server->caps |= NFS_CAP_DELEGTIME;
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 877f682b45f2..581864a15888 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -501,11 +501,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
sp = kzalloc(sizeof(*sp), gfp_flags);
if (!sp)
return NULL;
- sp->so_seqid.owner_id = ida_alloc(&server->openowner_id, gfp_flags);
- if (sp->so_seqid.owner_id < 0) {
- kfree(sp);
- return NULL;
- }
+ sp->so_seqid.owner_id = atomic64_inc_return(&server->owner_ctr);
sp->so_server = server;
sp->so_cred = get_cred(cred);
spin_lock_init(&sp->so_lock);
@@ -536,7 +532,6 @@ static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
{
nfs4_destroy_seqid_counter(&sp->so_seqid);
put_cred(sp->so_cred);
- ida_free(&sp->so_server->openowner_id, sp->so_seqid.owner_id);
kfree(sp);
}
@@ -879,19 +874,13 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
refcount_set(&lsp->ls_count, 1);
lsp->ls_state = state;
lsp->ls_owner = owner;
- lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
- if (lsp->ls_seqid.owner_id < 0)
- goto out_free;
+ lsp->ls_seqid.owner_id = atomic64_inc_return(&server->owner_ctr);
INIT_LIST_HEAD(&lsp->ls_locks);
return lsp;
-out_free:
- kfree(lsp);
- return NULL;
}
void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
- ida_free(&server->lockowner_id, lsp->ls_seqid.owner_id);
nfs4_destroy_seqid_counter(&lsp->ls_seqid);
kfree(lsp);
}
@@ -1957,6 +1946,7 @@ restart:
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
status = nfs4_recovery_handle_error(clp, status);
+ nfs4_free_state_owners(&freeme);
return (status != 0) ? status : -EAGAIN;
}
@@ -2023,6 +2013,12 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
nfs_mark_client_ready(clp, -EPERM);
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
return -EPERM;
+ case -ETIMEDOUT:
+ if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
+ nfs_mark_client_ready(clp, -EIO);
+ return -EIO;
+ }
+ fallthrough;
case -EACCES:
case -NFS4ERR_DELAY:
case -EAGAIN:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7704a4509676..e8ac3f615f93 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
+#include <linux/nfs_common.h>
#include "nfs4_fs.h"
#include "nfs4trace.h"
@@ -63,11 +64,7 @@
#define NFSDBG_FACILITY NFSDBG_XDR
-/* Mapping from NFS error code to "errno" error code. */
-#define errno_NFSERR_IO EIO
-
struct compound_hdr;
-static int nfs4_stat_to_errno(int);
static void encode_layoutget(struct xdr_stream *xdr,
const struct nfs4_layoutget_args *args,
struct compound_hdr *hdr);
@@ -975,11 +972,6 @@ static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
return p;
}
-static void encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
-{
- WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
-}
-
static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
WARN_ON_ONCE(xdr_stream_encode_opaque(xdr, str, len) < 0);
@@ -1424,12 +1416,12 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
*/
encode_nfs4_seqid(xdr, arg->seqid);
encode_share_access(xdr, arg->share_access);
- p = reserve_space(xdr, 36);
+ p = reserve_space(xdr, 40);
p = xdr_encode_hyper(p, arg->clientid);
- *p++ = cpu_to_be32(24);
+ *p++ = cpu_to_be32(28);
p = xdr_encode_opaque_fixed(p, "open id:", 8);
*p++ = cpu_to_be32(arg->server->s_dev);
- *p++ = cpu_to_be32(arg->id.uniquifier);
+ p = xdr_encode_hyper(p, arg->id.uniquifier);
xdr_encode_hyper(p, arg->id.create_time);
}
@@ -3447,7 +3439,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
- dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
+ dprintk("%s: link support=%s\n", __func__, str_false_true(*res == 0));
return 0;
}
@@ -3465,7 +3457,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
- dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
+ dprintk("%s: symlink support=%s\n", __func__, str_false_true(*res == 0));
return 0;
}
@@ -3607,7 +3599,7 @@ static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE;
}
- dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true");
+ dprintk("%s: case_insensitive=%s\n", __func__, str_false_true(*res == 0));
return 0;
}
@@ -3625,7 +3617,7 @@ static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap,
*res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING;
}
- dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true");
+ dprintk("%s: case_preserving=%s\n", __func__, str_false_true(*res == 0));
return 0;
}
@@ -4333,8 +4325,7 @@ static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
*res = be32_to_cpup(p);
bitmap[2] &= ~FATTR4_WORD2_XATTR_SUPPORT;
}
- dprintk("%s: XATTR support=%s\n", __func__,
- *res == 0 ? "false" : "true");
+ dprintk("%s: XATTR support=%s\n", __func__, str_false_true(*res == 0));
return 0;
}
@@ -4409,14 +4400,6 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access)
return 0;
}
-static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
-{
- ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len);
- if (unlikely(ret < 0))
- return -EIO;
- return 0;
-}
-
static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
@@ -7621,72 +7604,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
return 0;
}
-/*
- * We need to translate between nfs status return values and
- * the local errno values which may not be the same.
- */
-static struct {
- int stat;
- int errno;
-} nfs_errtbl[] = {
- { NFS4_OK, 0 },
- { NFS4ERR_PERM, -EPERM },
- { NFS4ERR_NOENT, -ENOENT },
- { NFS4ERR_IO, -errno_NFSERR_IO},
- { NFS4ERR_NXIO, -ENXIO },
- { NFS4ERR_ACCESS, -EACCES },
- { NFS4ERR_EXIST, -EEXIST },
- { NFS4ERR_XDEV, -EXDEV },
- { NFS4ERR_NOTDIR, -ENOTDIR },
- { NFS4ERR_ISDIR, -EISDIR },
- { NFS4ERR_INVAL, -EINVAL },
- { NFS4ERR_FBIG, -EFBIG },
- { NFS4ERR_NOSPC, -ENOSPC },
- { NFS4ERR_ROFS, -EROFS },
- { NFS4ERR_MLINK, -EMLINK },
- { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
- { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
- { NFS4ERR_DQUOT, -EDQUOT },
- { NFS4ERR_STALE, -ESTALE },
- { NFS4ERR_BADHANDLE, -EBADHANDLE },
- { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
- { NFS4ERR_NOTSUPP, -ENOTSUPP },
- { NFS4ERR_TOOSMALL, -ETOOSMALL },
- { NFS4ERR_SERVERFAULT, -EREMOTEIO },
- { NFS4ERR_BADTYPE, -EBADTYPE },
- { NFS4ERR_LOCKED, -EAGAIN },
- { NFS4ERR_SYMLINK, -ELOOP },
- { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
- { NFS4ERR_DEADLOCK, -EDEADLK },
- { NFS4ERR_NOXATTR, -ENODATA },
- { NFS4ERR_XATTR2BIG, -E2BIG },
- { -1, -EIO }
-};
-
-/*
- * Convert an NFS error code to a local one.
- * This one is used jointly by NFSv2 and NFSv3.
- */
-static int
-nfs4_stat_to_errno(int stat)
-{
- int i;
- for (i = 0; nfs_errtbl[i].stat != -1; i++) {
- if (nfs_errtbl[i].stat == stat)
- return nfs_errtbl[i].errno;
- }
- if (stat <= 10000 || stat > 10100) {
- /* The server is looney tunes. */
- return -EREMOTEIO;
- }
- /* If we cannot translate the error, the recovery routines should
- * handle it.
- * Note: remaining NFSv4 error codes have values > 10000, so should
- * not conflict with native Linux error codes.
- */
- return -stat;
-}
-
#ifdef CONFIG_NFS_V4_2
#include "nfs42xdr.c"
#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 352fdaed4075..1eab98c277fa 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1685,6 +1685,67 @@ TRACE_EVENT(nfs_mount_path,
TP_printk("path='%s'", __get_str(path))
);
+TRACE_EVENT(nfs_local_open_fh,
+ TP_PROTO(
+ const struct nfs_fh *fh,
+ fmode_t fmode,
+ int error
+ ),
+
+ TP_ARGS(fh, fmode, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(u32, fhandle)
+ __field(unsigned int, fmode)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fh);
+ __entry->fmode = (__force unsigned int)fmode;
+ ),
+
+ TP_printk(
+ "error=%d fhandle=0x%08x mode=%s",
+ __entry->error,
+ __entry->fhandle,
+ show_fs_fmode_flags(__entry->fmode)
+ )
+);
+
+DECLARE_EVENT_CLASS(nfs_local_client_event,
+ TP_PROTO(
+ const struct nfs_client *clp
+ ),
+
+ TP_ARGS(clp),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, protocol)
+ __string(server, clp->cl_hostname)
+ ),
+
+ TP_fast_assign(
+ __entry->protocol = clp->rpc_ops->version;
+ __assign_str(server);
+ ),
+
+ TP_printk(
+ "server=%s NFSv%u", __get_str(server), __entry->protocol
+ )
+);
+
+#define DEFINE_NFS_LOCAL_CLIENT_EVENT(name) \
+ DEFINE_EVENT(nfs_local_client_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp \
+ ), \
+ TP_ARGS(clp))
+
+DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_enable);
+DEFINE_NFS_LOCAL_CLIENT_EVENT(nfs_local_disable);
+
DECLARE_EVENT_CLASS(nfs_xdr_event,
TP_PROTO(
const struct xdr_stream *xdr,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 04124f226665..e27c07bd8929 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -731,7 +731,8 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
const struct cred *cred, const struct nfs_rpc_ops *rpc_ops,
- const struct rpc_call_ops *call_ops, int how, int flags)
+ const struct rpc_call_ops *call_ops, int how, int flags,
+ struct nfsd_file *localio)
{
struct rpc_task *task;
struct rpc_message msg = {
@@ -761,6 +762,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
hdr->args.count,
(unsigned long long)hdr->args.offset);
+ if (localio)
+ return nfs_local_doio(NFS_SERVER(hdr->inode)->nfs_client,
+ localio, hdr, call_ops);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -953,6 +958,12 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0) {
+ struct nfs_client *clp = NFS_SERVER(hdr->inode)->nfs_client;
+
+ struct nfsd_file *localio =
+ nfs_local_open_fh(clp, hdr->cred,
+ hdr->args.fh, hdr->args.context->mode);
+
if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion)
task_flags = RPC_TASK_MOVEABLE;
ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
@@ -961,7 +972,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
NFS_PROTO(hdr->inode),
desc->pg_rpc_callops,
desc->pg_ioflags,
- RPC_TASK_CRED_NOREF | task_flags);
+ RPC_TASK_CRED_NOREF | task_flags,
+ localio);
}
return ret;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index a74ee69a2fa6..dbef837e871a 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -490,7 +490,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how,
- RPC_TASK_CRED_NOREF);
+ RPC_TASK_CRED_NOREF, NULL);
} else {
nfs_init_commit(data, NULL, data->lseg, cinfo);
initiate_commit(data, how);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a6103333b666..81bd1b9aba17 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -48,8 +48,7 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void)
static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
{
- if (rhdr->res.scratch != NULL)
- kfree(rhdr->res.scratch);
+ kfree(rhdr->res.scratch);
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 97b386032b71..9723b6c53397 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -551,6 +551,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
else
seq_puts(m, ",local_lock=posix");
+ if (nfss->flags & NFS_MOUNT_NO_ALIGNWRITE)
+ seq_puts(m, ",noalignwrite");
+
if (nfss->flags & NFS_MOUNT_WRITE_EAGER) {
if (nfss->flags & NFS_MOUNT_WRITE_WAIT)
seq_puts(m, ",write=wait");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d074d0ceb4f0..ead2dc55952d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -772,8 +772,7 @@ static void nfs_inode_add_request(struct nfs_page *req)
nfs_lock_request(req);
spin_lock(&mapping->i_private_lock);
set_bit(PG_MAPPED, &req->wb_flags);
- folio_set_private(folio);
- folio->private = req;
+ folio_attach_private(folio, req);
spin_unlock(&mapping->i_private_lock);
atomic_long_inc(&nfsi->nrequests);
/* this a head request for a page group - mark it as having an
@@ -797,8 +796,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_lock(&mapping->i_private_lock);
if (likely(folio)) {
- folio->private = NULL;
- folio_clear_private(folio);
+ folio_detach_private(folio);
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
spin_unlock(&mapping->i_private_lock);
@@ -1297,7 +1295,10 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
struct file_lock_context *flctx = locks_inode_context(inode);
struct file_lock *fl;
int ret;
+ unsigned int mntflags = NFS_SERVER(inode)->flags;
+ if (mntflags & NFS_MOUNT_NO_ALIGNWRITE)
+ return 0;
if (file->f_flags & O_DSYNC)
return 0;
if (!nfs_folio_write_uptodate(folio, pagelen))
@@ -1663,7 +1664,8 @@ EXPORT_SYMBOL_GPL(nfs_commitdata_release);
int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
- int how, int flags)
+ int how, int flags,
+ struct nfsd_file *localio)
{
struct rpc_task *task;
int priority = flush_task_priority(how);
@@ -1692,6 +1694,9 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
dprintk("NFS: initiated commit call\n");
+ if (localio)
+ return nfs_local_commit(localio, data, call_ops, how);
+
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1791,6 +1796,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
struct nfs_commit_info *cinfo)
{
struct nfs_commit_data *data;
+ struct nfsd_file *localio;
unsigned short task_flags = 0;
/* another commit raced with us */
@@ -1807,9 +1813,12 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
nfs_init_commit(data, head, NULL, cinfo);
if (NFS_SERVER(inode)->nfs_client->cl_minorversion)
task_flags = RPC_TASK_MOVEABLE;
+
+ localio = nfs_local_open_fh(NFS_SERVER(inode)->nfs_client, data->cred,
+ data->args.fh, data->context->mode);
return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
data->mds_ops, how,
- RPC_TASK_CRED_NOREF | task_flags);
+ RPC_TASK_CRED_NOREF | task_flags, localio);
}
/*
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index 119c75ab9fd0..a5e54809701e 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -6,5 +6,10 @@
obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
nfs_acl-objs := nfsacl.o
+obj-$(CONFIG_NFS_COMMON_LOCALIO_SUPPORT) += nfs_localio.o
+nfs_localio-objs := nfslocalio.o
+
obj-$(CONFIG_GRACE_PERIOD) += grace.o
obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o
+
+obj-$(CONFIG_NFS_COMMON) += common.o
diff --git a/fs/nfs_common/common.c b/fs/nfs_common/common.c
new file mode 100644
index 000000000000..34a115176f97
--- /dev/null
+++ b/fs/nfs_common/common.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/module.h>
+#include <linux/nfs_common.h>
+#include <linux/nfs4.h>
+
+/*
+ * We need to translate between nfs status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs_errtbl[] = {
+ { NFS_OK, 0 },
+ { NFSERR_PERM, -EPERM },
+ { NFSERR_NOENT, -ENOENT },
+ { NFSERR_IO, -errno_NFSERR_IO},
+ { NFSERR_NXIO, -ENXIO },
+/* { NFSERR_EAGAIN, -EAGAIN }, */
+ { NFSERR_ACCES, -EACCES },
+ { NFSERR_EXIST, -EEXIST },
+ { NFSERR_XDEV, -EXDEV },
+ { NFSERR_NODEV, -ENODEV },
+ { NFSERR_NOTDIR, -ENOTDIR },
+ { NFSERR_ISDIR, -EISDIR },
+ { NFSERR_INVAL, -EINVAL },
+ { NFSERR_FBIG, -EFBIG },
+ { NFSERR_NOSPC, -ENOSPC },
+ { NFSERR_ROFS, -EROFS },
+ { NFSERR_MLINK, -EMLINK },
+ { NFSERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFSERR_NOTEMPTY, -ENOTEMPTY },
+ { NFSERR_DQUOT, -EDQUOT },
+ { NFSERR_STALE, -ESTALE },
+ { NFSERR_REMOTE, -EREMOTE },
+#ifdef EWFLUSH
+ { NFSERR_WFLUSH, -EWFLUSH },
+#endif
+ { NFSERR_BADHANDLE, -EBADHANDLE },
+ { NFSERR_NOT_SYNC, -ENOTSYNC },
+ { NFSERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFSERR_NOTSUPP, -ENOTSUPP },
+ { NFSERR_TOOSMALL, -ETOOSMALL },
+ { NFSERR_SERVERFAULT, -EREMOTEIO },
+ { NFSERR_BADTYPE, -EBADTYPE },
+ { NFSERR_JUKEBOX, -EJUKEBOX },
+ { -1, -EIO }
+};
+
+/**
+ * nfs_stat_to_errno - convert an NFS status code to a local errno
+ * @status: NFS status code to convert
+ *
+ * Returns a local errno value, or -EIO if the NFS status code is
+ * not recognized. This function is used jointly by NFSv2 and NFSv3.
+ */
+int nfs_stat_to_errno(enum nfs_stat status)
+{
+ int i;
+
+ for (i = 0; nfs_errtbl[i].stat != -1; i++) {
+ if (nfs_errtbl[i].stat == (int)status)
+ return nfs_errtbl[i].errno;
+ }
+ return nfs_errtbl[i].errno;
+}
+EXPORT_SYMBOL_GPL(nfs_stat_to_errno);
+
+/*
+ * We need to translate between nfs v4 status return values and
+ * the local errno values which may not be the same.
+ */
+static const struct {
+ int stat;
+ int errno;
+} nfs4_errtbl[] = {
+ { NFS4_OK, 0 },
+ { NFS4ERR_PERM, -EPERM },
+ { NFS4ERR_NOENT, -ENOENT },
+ { NFS4ERR_IO, -errno_NFSERR_IO},
+ { NFS4ERR_NXIO, -ENXIO },
+ { NFS4ERR_ACCESS, -EACCES },
+ { NFS4ERR_EXIST, -EEXIST },
+ { NFS4ERR_XDEV, -EXDEV },
+ { NFS4ERR_NOTDIR, -ENOTDIR },
+ { NFS4ERR_ISDIR, -EISDIR },
+ { NFS4ERR_INVAL, -EINVAL },
+ { NFS4ERR_FBIG, -EFBIG },
+ { NFS4ERR_NOSPC, -ENOSPC },
+ { NFS4ERR_ROFS, -EROFS },
+ { NFS4ERR_MLINK, -EMLINK },
+ { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG },
+ { NFS4ERR_NOTEMPTY, -ENOTEMPTY },
+ { NFS4ERR_DQUOT, -EDQUOT },
+ { NFS4ERR_STALE, -ESTALE },
+ { NFS4ERR_BADHANDLE, -EBADHANDLE },
+ { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
+ { NFS4ERR_NOTSUPP, -ENOTSUPP },
+ { NFS4ERR_TOOSMALL, -ETOOSMALL },
+ { NFS4ERR_SERVERFAULT, -EREMOTEIO },
+ { NFS4ERR_BADTYPE, -EBADTYPE },
+ { NFS4ERR_LOCKED, -EAGAIN },
+ { NFS4ERR_SYMLINK, -ELOOP },
+ { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
+ { NFS4ERR_DEADLOCK, -EDEADLK },
+ { NFS4ERR_NOXATTR, -ENODATA },
+ { NFS4ERR_XATTR2BIG, -E2BIG },
+ { -1, -EIO }
+};
+
+/*
+ * Convert an NFS error code to a local one.
+ * This one is used by NFSv4.
+ */
+int nfs4_stat_to_errno(int stat)
+{
+ int i;
+ for (i = 0; nfs4_errtbl[i].stat != -1; i++) {
+ if (nfs4_errtbl[i].stat == stat)
+ return nfs4_errtbl[i].errno;
+ }
+ if (stat <= 10000 || stat > 10100) {
+ /* The server is looney tunes. */
+ return -EREMOTEIO;
+ }
+ /* If we cannot translate the error, the recovery routines should
+ * handle it.
+ * Note: remaining NFSv4 error codes have values > 10000, so should
+ * not conflict with native Linux error codes.
+ */
+ return -stat;
+}
+EXPORT_SYMBOL_GPL(nfs4_stat_to_errno);
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
new file mode 100644
index 000000000000..42b479b9191f
--- /dev/null
+++ b/fs/nfs_common/nfslocalio.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ * Copyright (C) 2024 NeilBrown <neilb@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/nfslocalio.h>
+#include <net/netns/generic.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NFS localio protocol bypass support");
+
+static DEFINE_SPINLOCK(nfs_uuid_lock);
+
+/*
+ * Global list of nfs_uuid_t instances
+ * that is protected by nfs_uuid_lock.
+ */
+static LIST_HEAD(nfs_uuids);
+
+void nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
+{
+ nfs_uuid->net = NULL;
+ nfs_uuid->dom = NULL;
+ uuid_gen(&nfs_uuid->uuid);
+
+ spin_lock(&nfs_uuid_lock);
+ list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids);
+ spin_unlock(&nfs_uuid_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_begin);
+
+void nfs_uuid_end(nfs_uuid_t *nfs_uuid)
+{
+ if (nfs_uuid->net == NULL) {
+ spin_lock(&nfs_uuid_lock);
+ list_del_init(&nfs_uuid->list);
+ spin_unlock(&nfs_uuid_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_end);
+
+static nfs_uuid_t * nfs_uuid_lookup_locked(const uuid_t *uuid)
+{
+ nfs_uuid_t *nfs_uuid;
+
+ list_for_each_entry(nfs_uuid, &nfs_uuids, list)
+ if (uuid_equal(&nfs_uuid->uuid, uuid))
+ return nfs_uuid;
+
+ return NULL;
+}
+
+static struct module *nfsd_mod;
+
+void nfs_uuid_is_local(const uuid_t *uuid, struct list_head *list,
+ struct net *net, struct auth_domain *dom,
+ struct module *mod)
+{
+ nfs_uuid_t *nfs_uuid;
+
+ spin_lock(&nfs_uuid_lock);
+ nfs_uuid = nfs_uuid_lookup_locked(uuid);
+ if (nfs_uuid) {
+ kref_get(&dom->ref);
+ nfs_uuid->dom = dom;
+ /*
+ * We don't hold a ref on the net, but instead put
+ * ourselves on a list so the net pointer can be
+ * invalidated.
+ */
+ list_move(&nfs_uuid->list, list);
+ rcu_assign_pointer(nfs_uuid->net, net);
+
+ __module_get(mod);
+ nfsd_mod = mod;
+ }
+ spin_unlock(&nfs_uuid_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_is_local);
+
+static void nfs_uuid_put_locked(nfs_uuid_t *nfs_uuid)
+{
+ if (nfs_uuid->net) {
+ module_put(nfsd_mod);
+ nfs_uuid->net = NULL;
+ }
+ if (nfs_uuid->dom) {
+ auth_domain_put(nfs_uuid->dom);
+ nfs_uuid->dom = NULL;
+ }
+ list_del_init(&nfs_uuid->list);
+}
+
+void nfs_uuid_invalidate_clients(struct list_head *list)
+{
+ nfs_uuid_t *nfs_uuid, *tmp;
+
+ spin_lock(&nfs_uuid_lock);
+ list_for_each_entry_safe(nfs_uuid, tmp, list, list)
+ nfs_uuid_put_locked(nfs_uuid);
+ spin_unlock(&nfs_uuid_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_clients);
+
+void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid)
+{
+ if (nfs_uuid->net) {
+ spin_lock(&nfs_uuid_lock);
+ nfs_uuid_put_locked(nfs_uuid);
+ spin_unlock(&nfs_uuid_lock);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_invalidate_one_client);
+
+struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
+ struct rpc_clnt *rpc_clnt, const struct cred *cred,
+ const struct nfs_fh *nfs_fh, const fmode_t fmode)
+{
+ struct net *net;
+ struct nfsd_file *localio;
+
+ /*
+ * Not running in nfsd context, so must safely get reference on nfsd_serv.
+ * But the server may already be shutting down, if so disallow new localio.
+ * uuid->net is NOT a counted reference, but rcu_read_lock() ensures that
+ * if uuid->net is not NULL, then calling nfsd_serv_try_get() is safe
+ * and if it succeeds we will have an implied reference to the net.
+ *
+ * Otherwise NFS may not have ref on NFSD and therefore cannot safely
+ * make 'nfs_to' calls.
+ */
+ rcu_read_lock();
+ net = rcu_dereference(uuid->net);
+ if (!net || !nfs_to->nfsd_serv_try_get(net)) {
+ rcu_read_unlock();
+ return ERR_PTR(-ENXIO);
+ }
+ rcu_read_unlock();
+ /* We have an implied reference to net thanks to nfsd_serv_try_get */
+ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt,
+ cred, nfs_fh, fmode);
+ if (IS_ERR(localio))
+ nfs_to->nfsd_serv_put(net);
+ return localio;
+}
+EXPORT_SYMBOL_GPL(nfs_open_local_fh);
+
+/*
+ * The NFS LOCALIO code needs to call into NFSD using various symbols,
+ * but cannot be statically linked, because that will make the NFS
+ * module always depend on the NFSD module.
+ *
+ * 'nfs_to' provides NFS access to NFSD functions needed for LOCALIO,
+ * its lifetime is tightly coupled to the NFSD module and will always
+ * be available to NFS LOCALIO because any successful client<->server
+ * LOCALIO handshake results in a reference on the NFSD module (above),
+ * so NFS implicitly holds a reference to the NFSD module and its
+ * functions in the 'nfs_to' nfsd_localio_operations cannot disappear.
+ *
+ * If the last NFS client using LOCALIO disconnects (and its reference
+ * on NFSD dropped) then NFSD could be unloaded, resulting in 'nfs_to'
+ * functions being invalid pointers. But if NFSD isn't loaded then NFS
+ * will not be able to handshake with NFSD and will have no cause to
+ * try to call 'nfs_to' function pointers. If/when NFSD is reloaded it
+ * will reinitialize the 'nfs_to' function pointers and make LOCALIO
+ * possible.
+ */
+const struct nfsd_localio_operations *nfs_to;
+EXPORT_SYMBOL_GPL(nfs_to);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index ec2ab6429e00..c0bd1509ccd4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -7,6 +7,7 @@ config NFSD
select LOCKD
select SUNRPC
select EXPORTFS
+ select NFS_COMMON
select NFS_ACL_SUPPORT if NFSD_V2_ACL
select NFS_ACL_SUPPORT if NFSD_V3_ACL
depends on MULTIUSER
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index b8736a82e57c..18cbd3fa7691 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -23,3 +23,4 @@ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
+nfsd-$(CONFIG_NFS_LOCALIO) += localio.o
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index e6beaaf4f170..93e33d1ee891 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -5,26 +5,26 @@
#include "nfsd.h"
#include "auth.h"
-int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
+int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp)
{
struct exp_flavor_info *f;
struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
for (f = exp->ex_flavors; f < end; f++) {
- if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
+ if (f->pseudoflavor == cred->cr_flavor)
return f->flags;
}
return exp->ex_flags;
}
-int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
+int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp)
{
struct group_info *rqgi;
struct group_info *gi;
struct cred *new;
int i;
- int flags = nfsexp_flags(rqstp, exp);
+ int flags = nfsexp_flags(cred, exp);
/* discard any old override before preparing the new set */
revert_creds(get_cred(current_real_cred()));
@@ -32,10 +32,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
if (!new)
return -ENOMEM;
- new->fsuid = rqstp->rq_cred.cr_uid;
- new->fsgid = rqstp->rq_cred.cr_gid;
+ new->fsuid = cred->cr_uid;
+ new->fsgid = cred->cr_gid;
- rqgi = rqstp->rq_cred.cr_group_info;
+ rqgi = cred->cr_group_info;
if (flags & NFSEXP_ALLSQUASH) {
new->fsuid = exp->ex_anon_uid;
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
index dbd66424f600..8c5031bbbcee 100644
--- a/fs/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -12,6 +12,6 @@
* Set the current process's fsuid/fsgid etc to those of the NFS
* client user
*/
-int nfsd_setuser(struct svc_rqst *, struct svc_export *);
+int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp);
#endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 3c040c81c77d..08a20e5bcf7f 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -147,8 +147,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb,
struct pnfs_block_deviceaddr *dev;
struct pnfs_block_volume *b;
- dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
- sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ dev = kzalloc(struct_size(dev, volumes, 1), GFP_KERNEL);
if (!dev)
return -ENOMEM;
gdp->gd_device = dev;
@@ -255,8 +254,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
const struct pr_ops *ops;
int ret;
- dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
- sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ dev = kzalloc(struct_size(dev, volumes, 1), GFP_KERNEL);
if (!dev)
return -ENOMEM;
gdp->gd_device = dev;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index b0361e8aa9a7..4e28ac8f1127 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -47,7 +47,7 @@ struct pnfs_block_volume {
struct pnfs_block_deviceaddr {
u32 nr_volumes;
- struct pnfs_block_volume volumes[];
+ struct pnfs_block_volume volumes[] __counted_by(nr_volumes);
};
__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 66a05fefae98..bb7addef4a31 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -10,7 +10,7 @@
#define NFSCACHE_H
#include <linux/sunrpc/svc.h>
-#include "netns.h"
+#include "nfsd.h"
/*
* Representation of a reply cache entry.
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 50b3135d07ac..c82d8e3e0d4f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1074,10 +1074,30 @@ static struct svc_export *exp_find(struct cache_detail *cd,
return exp;
}
+/**
+ * check_nfsd_access - check if access to export is allowed.
+ * @exp: svc_export that is being accessed.
+ * @rqstp: svc_rqst attempting to access @exp (will be NULL for LOCALIO).
+ *
+ * Return values:
+ * %nfs_ok if access is granted, or
+ * %nfserr_wrongsec if access is denied
+ */
__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
{
struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors;
- struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_xprt *xprt;
+
+ /*
+ * If rqstp is NULL, this is a LOCALIO request which will only
+ * ever use a filehandle/credential pair for which access has
+ * been affirmed (by ACCESS or OPEN NFS requests) over the
+ * wire. So there is no need for further checks here.
+ */
+ if (!rqstp)
+ return nfs_ok;
+
+ xprt = rqstp->rq_xprt;
if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) {
if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags))
@@ -1098,17 +1118,17 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
ok:
/* legacy gss-only clients are always OK: */
if (exp->ex_client == rqstp->rq_gssclient)
- return 0;
+ return nfs_ok;
/* ip-address based client; check sec= export option: */
for (f = exp->ex_flavors; f < end; f++) {
if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
- return 0;
+ return nfs_ok;
}
/* defaults in absence of sec= options: */
if (exp->ex_nflavors == 0) {
if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
- return 0;
+ return nfs_ok;
}
/* If the compound op contains a spo_must_allowed op,
@@ -1118,10 +1138,10 @@ ok:
*/
if (nfsd4_spo_must_allow(rqstp))
- return 0;
+ return nfs_ok;
denied:
- return rqstp->rq_vers < 4 ? nfserr_acces : nfserr_wrongsec;
+ return nfserr_wrongsec;
}
/*
@@ -1164,19 +1184,35 @@ gss:
return gssexp;
}
+/**
+ * rqst_exp_find - Find an svc_export in the context of a rqst or similar
+ * @reqp: The handle to be used to suspend the request if a cache-upcall is needed
+ * If NULL, missing in-cache information will result in failure.
+ * @net: The network namespace in which the request exists
+ * @cl: default auth_domain to use for looking up the export
+ * @gsscl: an alternate auth_domain defined using deprecated gss/krb5 format.
+ * @fsid_type: The type of fsid to look for
+ * @fsidv: The actual fsid to look up in the context of either client.
+ *
+ * Perform a lookup for @cl/@fsidv in the given @net for an export. If
+ * none found and @gsscl specified, repeat the lookup.
+ *
+ * Returns an export, or an error pointer.
+ */
struct svc_export *
-rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
+rqst_exp_find(struct cache_req *reqp, struct net *net,
+ struct auth_domain *cl, struct auth_domain *gsscl,
+ int fsid_type, u32 *fsidv)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct cache_detail *cd = nn->svc_export_cache;
- if (rqstp->rq_client == NULL)
+ if (!cl)
goto gss;
/* First try the auth_unix client: */
- exp = exp_find(cd, rqstp->rq_client, fsid_type,
- fsidv, &rqstp->rq_chandle);
+ exp = exp_find(cd, cl, fsid_type, fsidv, reqp);
if (PTR_ERR(exp) == -ENOENT)
goto gss;
if (IS_ERR(exp))
@@ -1186,10 +1222,9 @@ rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
return exp;
gss:
/* Otherwise, try falling back on gss client */
- if (rqstp->rq_gssclient == NULL)
+ if (!gsscl)
return exp;
- gssexp = exp_find(cd, rqstp->rq_gssclient, fsid_type, fsidv,
- &rqstp->rq_chandle);
+ gssexp = exp_find(cd, gsscl, fsid_type, fsidv, reqp);
if (PTR_ERR(gssexp) == -ENOENT)
return exp;
if (!IS_ERR(exp))
@@ -1220,7 +1255,9 @@ struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp)
mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
- return rqst_exp_find(rqstp, FSID_NUM, fsidv);
+ return rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp),
+ rqstp->rq_client, rqstp->rq_gssclient,
+ FSID_NUM, fsidv);
}
/*
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index ca9dc230ae3d..3794ae253a70 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -99,7 +99,8 @@ struct svc_expkey {
#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE)
#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
-int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
+struct svc_cred;
+int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp);
__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
/*
@@ -127,6 +128,8 @@ static inline struct svc_export *exp_get(struct svc_export *exp)
cache_get(&exp->h);
return exp;
}
-struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
+struct svc_export *rqst_exp_find(struct cache_req *reqp, struct net *net,
+ struct auth_domain *cl, struct auth_domain *gsscl,
+ int fsid_type, u32 *fsidv);
#endif /* NFSD_EXPORT_H */
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f4704f5d4086..19bb88c7eebd 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -52,10 +52,11 @@
#define NFSD_FILE_CACHE_UP (0)
/* We only care about NFSD_MAY_READ/WRITE for this cache */
-#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
+#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE|NFSD_MAY_LOCALIO)
static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
+static DEFINE_PER_CPU(unsigned long, nfsd_file_allocations);
static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
@@ -111,7 +112,7 @@ static void
nfsd_file_schedule_laundrette(void)
{
if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
- queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+ queue_delayed_work(system_unbound_wq, &nfsd_filecache_laundrette,
NFSD_LAUNDRETTE_DELAY);
}
@@ -151,7 +152,7 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
}
static struct nfsd_file_mark *
-nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
+nfsd_file_mark_find_or_create(struct inode *inode)
{
int err;
struct fsnotify_mark *mark;
@@ -215,7 +216,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
if (unlikely(!nf))
return NULL;
+ this_cpu_inc(nfsd_file_allocations);
INIT_LIST_HEAD(&nf->nf_lru);
+ INIT_LIST_HEAD(&nf->nf_gc);
nf->nf_birthtime = ktime_get();
nf->nf_file = NULL;
nf->nf_cred = get_current_cred();
@@ -387,14 +390,42 @@ nfsd_file_put(struct nfsd_file *nf)
nfsd_file_free(nf);
}
+/**
+ * nfsd_file_put_local - put the reference to nfsd_file and local nfsd_serv
+ * @nf: nfsd_file of which to put the references
+ *
+ * First put the reference of the nfsd_file and then put the
+ * reference to the associated nn->nfsd_serv.
+ */
+void
+nfsd_file_put_local(struct nfsd_file *nf)
+{
+ struct net *net = nf->nf_net;
+
+ nfsd_file_put(nf);
+ nfsd_serv_put(net);
+}
+
+/**
+ * nfsd_file_file - get the backing file of an nfsd_file
+ * @nf: nfsd_file of which to access the backing file.
+ *
+ * Return backing file for @nf.
+ */
+struct file *
+nfsd_file_file(struct nfsd_file *nf)
+{
+ return nf->nf_file;
+}
+
static void
nfsd_file_dispose_list(struct list_head *dispose)
{
struct nfsd_file *nf;
while (!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
+ nf = list_first_entry(dispose, struct nfsd_file, nf_gc);
+ list_del_init(&nf->nf_gc);
nfsd_file_free(nf);
}
}
@@ -411,12 +442,12 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
{
while(!list_empty(dispose)) {
struct nfsd_file *nf = list_first_entry(dispose,
- struct nfsd_file, nf_lru);
+ struct nfsd_file, nf_gc);
struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
struct nfsd_fcache_disposal *l = nn->fcache_disposal;
spin_lock(&l->lock);
- list_move_tail(&nf->nf_lru, &l->freeme);
+ list_move_tail(&nf->nf_gc, &l->freeme);
spin_unlock(&l->lock);
svc_wake_up(nn->nfsd_serv);
}
@@ -503,7 +534,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
/* Refcount went to zero. Unhash it and queue it to the dispose list */
nfsd_file_unhash(nf);
- list_lru_isolate_move(lru, &nf->nf_lru, head);
+ list_lru_isolate(lru, &nf->nf_lru);
+ list_add(&nf->nf_gc, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
@@ -578,7 +610,7 @@ nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
/* If refcount goes to 0, then put on the dispose list */
if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
- list_add(&nf->nf_lru, dispose);
+ list_add(&nf->nf_gc, dispose);
trace_nfsd_file_closing(nf);
}
}
@@ -654,8 +686,8 @@ nfsd_file_close_inode_sync(struct inode *inode)
nfsd_file_queue_for_close(inode, &dispose);
while (!list_empty(&dispose)) {
- nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_gc);
+ list_del_init(&nf->nf_gc);
nfsd_file_free(nf);
}
}
@@ -909,6 +941,7 @@ nfsd_file_cache_shutdown(void)
for_each_possible_cpu(i) {
per_cpu(nfsd_file_cache_hits, i) = 0;
per_cpu(nfsd_file_acquisitions, i) = 0;
+ per_cpu(nfsd_file_allocations, i) = 0;
per_cpu(nfsd_file_releases, i) = 0;
per_cpu(nfsd_file_total_age, i) = 0;
per_cpu(nfsd_file_evictions, i) = 0;
@@ -977,12 +1010,14 @@ nfsd_file_is_cached(struct inode *inode)
}
static __be32
-nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
+ struct svc_cred *cred,
+ struct auth_domain *client,
+ struct svc_fh *fhp,
unsigned int may_flags, struct file *file,
struct nfsd_file **pnf, bool want_gc)
{
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
- struct net *net = SVC_NET(rqstp);
struct nfsd_file *new, *nf;
bool stale_retry = true;
bool open_retry = true;
@@ -991,8 +1026,13 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
int ret;
retry:
- status = fh_verify(rqstp, fhp, S_IFREG,
- may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ if (rqstp) {
+ status = fh_verify(rqstp, fhp, S_IFREG,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ } else {
+ status = fh_verify_local(net, cred, client, fhp, S_IFREG,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ }
if (status != nfs_ok)
return status;
inode = d_inode(fhp->fh_dentry);
@@ -1024,7 +1064,7 @@ retry:
if (unlikely(nf)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
- nfsd_file_slab_free(&new->nf_rcu);
+ nfsd_file_free(new);
goto wait_for_construction;
}
nf = new;
@@ -1035,8 +1075,6 @@ retry:
if (likely(ret == 0))
goto open_file;
- if (ret == -EEXIST)
- goto retry;
trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret);
status = nfserr_jukebox;
goto construction_err;
@@ -1051,6 +1089,7 @@ wait_for_construction:
status = nfserr_jukebox;
goto construction_err;
}
+ nfsd_file_put(nf);
open_retry = false;
fh_put(fhp);
goto retry;
@@ -1074,7 +1113,7 @@ out:
open_file:
trace_nfsd_file_alloc(nf);
- nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode);
+ nf->nf_mark = nfsd_file_mark_find_or_create(inode);
if (nf->nf_mark) {
if (file) {
get_file(file);
@@ -1139,7 +1178,8 @@ __be32
nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, true);
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, NULL, pnf, true);
}
/**
@@ -1163,7 +1203,55 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, false);
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, NULL, pnf, false);
+}
+
+/**
+ * nfsd_file_acquire_local - Get a struct nfsd_file with an open file for localio
+ * @net: The network namespace in which to perform a lookup
+ * @cred: the user credential with which to validate access
+ * @client: the auth_domain for LOCALIO lookup
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * This file lookup interface provide access to a file given the
+ * filehandle and credential. No connection-based authorisation
+ * is performed and in that way it is quite different to other
+ * file access mediated by nfsd. It allows a kernel module such as the NFS
+ * client to reach across network and filesystem namespaces to access
+ * a file. The security implications of this should be carefully
+ * considered before use.
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
+ struct auth_domain *client, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ /*
+ * Save creds before calling nfsd_file_do_acquire() (which calls
+ * nfsd_setuser). Important because caller (LOCALIO) is from
+ * client context.
+ */
+ const struct cred *save_cred = get_current_cred();
+ __be32 beres;
+
+ beres = nfsd_file_do_acquire(NULL, net, cred, client,
+ fhp, may_flags, NULL, pnf, true);
+ revert_creds(save_cred);
+ return beres;
}
/**
@@ -1189,7 +1277,8 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct file *file,
struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, file, pnf, false);
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
+ fhp, may_flags, file, pnf, false);
}
/*
@@ -1199,7 +1288,7 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned long releases = 0, evictions = 0;
+ unsigned long allocations = 0, releases = 0, evictions = 0;
unsigned long hits = 0, acquisitions = 0;
unsigned int i, count = 0, buckets = 0;
unsigned long lru = 0, total_age = 0;
@@ -1224,6 +1313,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
for_each_possible_cpu(i) {
hits += per_cpu(nfsd_file_cache_hits, i);
acquisitions += per_cpu(nfsd_file_acquisitions, i);
+ allocations += per_cpu(nfsd_file_allocations, i);
releases += per_cpu(nfsd_file_releases, i);
total_age += per_cpu(nfsd_file_total_age, i);
evictions += per_cpu(nfsd_file_evictions, i);
@@ -1234,6 +1324,7 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "lru entries: %lu\n", lru);
seq_printf(m, "cache hits: %lu\n", hits);
seq_printf(m, "acquisitions: %lu\n", acquisitions);
+ seq_printf(m, "allocations: %lu\n", allocations);
seq_printf(m, "releases: %lu\n", releases);
seq_printf(m, "evictions: %lu\n", evictions);
if (releases)
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index c61884def906..cadf3c2689c4 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -44,6 +44,7 @@ struct nfsd_file {
struct nfsd_file_mark *nf_mark;
struct list_head nf_lru;
+ struct list_head nf_gc;
struct rcu_head nf_rcu;
ktime_t nf_birthtime;
};
@@ -54,7 +55,9 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
+void nfsd_file_put_local(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+struct file *nfsd_file_file(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
void nfsd_file_net_dispose(struct nfsd_net *nn);
bool nfsd_file_is_cached(struct inode *inode);
@@ -65,5 +68,8 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct file *file,
struct nfsd_file **nfp);
+__be32 nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
+ struct auth_domain *client, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf);
int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
new file mode 100644
index 000000000000..291e9c69cae4
--- /dev/null
+++ b/fs/nfsd/localio.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NFS server support for local clients to bypass network stack
+ *
+ * Copyright (C) 2014 Weston Andros Adamson <dros@primarydata.com>
+ * Copyright (C) 2019 Trond Myklebust <trond.myklebust@hammerspace.com>
+ * Copyright (C) 2024 Mike Snitzer <snitzer@hammerspace.com>
+ * Copyright (C) 2024 NeilBrown <neilb@suse.de>
+ */
+
+#include <linux/exportfs.h>
+#include <linux/sunrpc/svcauth.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/nfs.h>
+#include <linux/nfs_common.h>
+#include <linux/nfslocalio.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_xdr.h>
+#include <linux/string.h>
+
+#include "nfsd.h"
+#include "vfs.h"
+#include "netns.h"
+#include "filecache.h"
+#include "cache.h"
+
+static const struct nfsd_localio_operations nfsd_localio_ops = {
+ .nfsd_serv_try_get = nfsd_serv_try_get,
+ .nfsd_serv_put = nfsd_serv_put,
+ .nfsd_open_local_fh = nfsd_open_local_fh,
+ .nfsd_file_put_local = nfsd_file_put_local,
+ .nfsd_file_file = nfsd_file_file,
+};
+
+void nfsd_localio_ops_init(void)
+{
+ nfs_to = &nfsd_localio_ops;
+}
+
+/**
+ * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file
+ *
+ * @net: 'struct net' to get the proper nfsd_net required for LOCALIO access
+ * @dom: 'struct auth_domain' required for LOCALIO access
+ * @rpc_clnt: rpc_clnt that the client established
+ * @cred: cred that the client established
+ * @nfs_fh: filehandle to lookup
+ * @fmode: fmode_t to use for open
+ *
+ * This function maps a local fh to a path on a local filesystem.
+ * This is useful when the nfs client has the local server mounted - it can
+ * avoid all the NFS overhead with reads, writes and commits.
+ *
+ * On successful return, returned nfsd_file will have its nf_net member
+ * set. Caller (NFS client) is responsible for calling nfsd_serv_put and
+ * nfsd_file_put (via nfs_to->nfsd_file_put_local).
+ */
+struct nfsd_file *
+nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
+ struct rpc_clnt *rpc_clnt, const struct cred *cred,
+ const struct nfs_fh *nfs_fh, const fmode_t fmode)
+{
+ int mayflags = NFSD_MAY_LOCALIO;
+ struct svc_cred rq_cred;
+ struct svc_fh fh;
+ struct nfsd_file *localio;
+ __be32 beres;
+
+ if (nfs_fh->size > NFS4_FHSIZE)
+ return ERR_PTR(-EINVAL);
+
+ /* nfs_fh -> svc_fh */
+ fh_init(&fh, NFS4_FHSIZE);
+ fh.fh_handle.fh_size = nfs_fh->size;
+ memcpy(fh.fh_handle.fh_raw, nfs_fh->data, nfs_fh->size);
+
+ if (fmode & FMODE_READ)
+ mayflags |= NFSD_MAY_READ;
+ if (fmode & FMODE_WRITE)
+ mayflags |= NFSD_MAY_WRITE;
+
+ svcauth_map_clnt_to_svc_cred_local(rpc_clnt, cred, &rq_cred);
+
+ beres = nfsd_file_acquire_local(net, &rq_cred, dom,
+ &fh, mayflags, &localio);
+ if (beres)
+ localio = ERR_PTR(nfs_stat_to_errno(be32_to_cpu(beres)));
+
+ fh_put(&fh);
+ if (rq_cred.cr_group_info)
+ put_group_info(rq_cred.cr_group_info);
+
+ return localio;
+}
+EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
+
+/*
+ * UUID_IS_LOCAL XDR functions
+ */
+
+static __be32 localio_proc_null(struct svc_rqst *rqstp)
+{
+ return rpc_success;
+}
+
+struct localio_uuidarg {
+ uuid_t uuid;
+};
+
+static __be32 localio_proc_uuid_is_local(struct svc_rqst *rqstp)
+{
+ struct localio_uuidarg *argp = rqstp->rq_argp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs_uuid_is_local(&argp->uuid, &nn->local_clients,
+ net, rqstp->rq_client, THIS_MODULE);
+
+ return rpc_success;
+}
+
+static bool localio_decode_uuidarg(struct svc_rqst *rqstp,
+ struct xdr_stream *xdr)
+{
+ struct localio_uuidarg *argp = rqstp->rq_argp;
+ u8 uuid[UUID_SIZE];
+
+ if (decode_opaque_fixed(xdr, uuid, UUID_SIZE))
+ return false;
+ import_uuid(&argp->uuid, uuid);
+
+ return true;
+}
+
+static const struct svc_procedure localio_procedures1[] = {
+ [LOCALIOPROC_NULL] = {
+ .pc_func = localio_proc_null,
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
+ .pc_name = "NULL",
+ },
+ [LOCALIOPROC_UUID_IS_LOCAL] = {
+ .pc_func = localio_proc_uuid_is_local,
+ .pc_decode = localio_decode_uuidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct localio_uuidarg),
+ .pc_argzero = sizeof(struct localio_uuidarg),
+ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_name = "UUID_IS_LOCAL",
+ },
+};
+
+#define LOCALIO_NR_PROCEDURES ARRAY_SIZE(localio_procedures1)
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ localio_count[LOCALIO_NR_PROCEDURES]);
+const struct svc_version localio_version1 = {
+ .vs_vers = 1,
+ .vs_nproc = LOCALIO_NR_PROCEDURES,
+ .vs_proc = localio_procedures1,
+ .vs_dispatch = nfsd_dispatch,
+ .vs_count = localio_count,
+ .vs_xdrsize = XDR_QUADLEN(UUID_SIZE),
+ .vs_hidden = true,
+};
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 14ec15656320..26f7b34d1a03 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -13,6 +13,7 @@
#include <linux/filelock.h>
#include <linux/nfs4.h>
#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
#include <linux/siphash.h>
#include <linux/sunrpc/stats.h>
@@ -139,7 +140,9 @@ struct nfsd_net {
struct svc_info nfsd_info;
#define nfsd_serv nfsd_info.serv
-
+ struct percpu_ref nfsd_serv_ref;
+ struct completion nfsd_serv_confirm_done;
+ struct completion nfsd_serv_free_done;
/*
* clientid and stateid data for construction of net unique COPY
@@ -148,12 +151,13 @@ struct nfsd_net {
u32 s2s_cp_cl_id;
struct idr s2s_cp_stateids;
spinlock_t s2s_cp_lock;
+ atomic_t pending_async_copies;
/*
* Version information
*/
- bool *nfsd_versions;
- bool *nfsd4_minorversions;
+ bool nfsd_versions[NFSD_MAXVERS + 1];
+ bool nfsd4_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1];
/*
* Duplicate reply cache
@@ -213,16 +217,21 @@ struct nfsd_net {
/* last time an admin-revoke happened for NFSv4.0 */
time64_t nfs40_last_revoke;
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ /* Local clients to be invalidated when net is shut down */
+ struct list_head local_clients;
+#endif
};
/* Simple check to find out if a given net was properly initialized */
#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
extern bool nfsd_support_version(int vers);
-extern void nfsd_netns_free_versions(struct nfsd_net *nn);
-
extern unsigned int nfsd_net_id;
+bool nfsd_serv_try_get(struct net *net);
+void nfsd_serv_put(struct net *net);
+
void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn);
void nfsd_reset_write_verifier(struct nfsd_net *nn);
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index dfcc957e460d..372bdcf5e07a 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -28,6 +28,29 @@ static int nfs3_ftypes[] = {
S_IFIFO, /* NF3FIFO */
};
+static __be32 nfsd3_map_status(__be32 status)
+{
+ switch (status) {
+ case nfs_ok:
+ break;
+ case nfserr_nofilehandle:
+ status = nfserr_badhandle;
+ break;
+ case nfserr_wrongsec:
+ case nfserr_file_open:
+ status = nfserr_acces;
+ break;
+ case nfserr_symlink_not_dir:
+ status = nfserr_notdir;
+ break;
+ case nfserr_symlink:
+ case nfserr_wrong_type:
+ status = nfserr_inval;
+ break;
+ }
+ return status;
+}
+
/*
* NULL call.
*/
@@ -57,6 +80,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -80,6 +104,7 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
if (argp->check_guard)
guardtime = &argp->guardtime;
resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -103,6 +128,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp)
resp->status = nfsd_lookup(rqstp, &resp->dirfh,
argp->name, argp->len,
&resp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -122,6 +148,7 @@ nfsd3_proc_access(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->access = argp->access;
resp->status = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -142,6 +169,7 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp)
resp->pages = rqstp->rq_next_page++;
resp->status = nfsd_readlink(rqstp, &resp->fh,
page_address(*resp->pages), &resp->len);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -179,6 +207,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
&resp->count, &resp->eof);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -212,6 +241,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
rqstp->rq_vec, nvecs, &cnt,
resp->committed, resp->verf);
resp->count = cnt;
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -359,6 +389,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -384,6 +415,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
&attrs, S_IFDIR, 0, &resp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -424,6 +456,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
argp->flen, argp->tname, &attrs, &resp->fh);
kfree(argp->tname);
out:
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -465,6 +498,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
&attrs, type, rdev, &resp->fh);
out:
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -486,6 +520,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
argp->name, argp->len);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -506,6 +541,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
argp->name, argp->len);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -528,6 +564,7 @@ nfsd3_proc_rename(struct svc_rqst *rqstp)
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
&resp->tfh, argp->tname, argp->tlen);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -548,6 +585,7 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
&resp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -600,6 +638,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
/* Recycle only pages that were part of the reply */
rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -644,6 +683,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
rqstp->rq_next_page = resp->xdr.page_ptr + 1;
out:
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -661,6 +701,7 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp)
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -704,6 +745,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp)
}
fh_put(&argp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -746,6 +788,7 @@ nfsd3_proc_pathconf(struct svc_rqst *rqstp)
}
fh_put(&argp->fh);
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
@@ -773,6 +816,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
argp->count, resp->verf);
nfsd_file_put(nf);
out:
+ resp->status = nfsd3_map_status(resp->status);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index d756f443fc44..b5b3ab9d719a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1223,6 +1223,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
* cb_seq_status is only set in decode_cb_sequence4res,
* and so will remain 1 if an rpc level failure occurs.
*/
+ trace_nfsd_cb_rpc_prepare(clp);
cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion && !nfsd41_cb_get_slot(cb, task))
@@ -1329,11 +1330,14 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
+ trace_nfsd_cb_rpc_done(clp);
+
if (!nfsd4_cb_sequence_done(task, cb))
return;
if (cb->cb_status) {
- WARN_ON_ONCE(task->tk_status);
+ WARN_ONCE(task->tk_status, "cb_status=%d tk_status=%d",
+ cb->cb_status, task->tk_status);
task->tk_status = cb->cb_status;
}
@@ -1359,6 +1363,8 @@ static void nfsd4_cb_release(void *calldata)
{
struct nfsd4_callback *cb = calldata;
+ trace_nfsd_cb_rpc_release(cb->cb_clp);
+
if (cb->cb_need_restart)
nfsd4_queue_cb(cb);
else
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 7a806ac13e31..8cca1329f348 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -581,6 +581,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
.id = id,
.type = type,
};
+ __be32 status = nfs_ok;
__be32 *p;
int ret;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
@@ -593,12 +594,16 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
return nfserrno(ret);
ret = strlen(item->name);
WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+
p = xdr_reserve_space(xdr, ret + 4);
- if (!p)
- return nfserr_resource;
- p = xdr_encode_opaque(p, item->name, ret);
+ if (unlikely(!p)) {
+ status = nfserr_resource;
+ goto out_put;
+ }
+ xdr_encode_opaque(p, item->name, ret);
+out_put:
cache_put(&item->h, nn->idtoname_cache);
- return 0;
+ return status;
}
static bool
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 4f3072b5979a..fbfddd3c4c94 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -740,6 +740,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
.prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
+ .opcode = OP_CB_LAYOUTRECALL,
};
static bool
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2e39cf2e502a..b5a6bf4f459f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -158,7 +158,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
return fh_verify(rqstp, current_fh, S_IFREG, accmode);
}
-static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+static __be32 nfsd_check_obj_isreg(struct svc_fh *fh, u32 minor_version)
{
umode_t mode = d_inode(fh->fh_dentry)->i_mode;
@@ -166,14 +166,15 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
return nfs_ok;
if (S_ISDIR(mode))
return nfserr_isdir;
- /*
- * Using err_symlink as our catch-all case may look odd; but
- * there's no other obvious error for this case in 4.0, and we
- * happen to know that it will cause the linux v4 client to do
- * the right thing on attempts to open something other than a
- * regular file.
- */
- return nfserr_symlink;
+ if (S_ISLNK(mode))
+ return nfserr_symlink;
+
+ /* RFC 7530 - 16.16.6 */
+ if (minor_version == 0)
+ return nfserr_symlink;
+ else
+ return nfserr_wrong_type;
+
}
static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
@@ -466,7 +467,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
}
if (status)
goto out;
- status = nfsd_check_obj_isreg(*resfh);
+ status = nfsd_check_obj_isreg(*resfh, cstate->minorversion);
if (status)
goto out;
@@ -751,15 +752,6 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&access->ac_supported);
}
-static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
-{
- __be32 *verf = (__be32 *)verifier->data;
-
- BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
-
- nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id));
-}
-
static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -1288,6 +1280,7 @@ static void nfs4_put_copy(struct nfsd4_copy *copy)
{
if (!refcount_dec_and_test(&copy->refcount))
return;
+ atomic_dec(&copy->cp_nn->pending_async_copies);
kfree(copy->cp_src);
kfree(copy);
}
@@ -1621,7 +1614,8 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
.release = nfsd4_cb_offload_release,
- .done = nfsd4_cb_offload_done
+ .done = nfsd4_cb_offload_done,
+ .opcode = OP_CB_OFFLOAD,
};
static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
@@ -1630,7 +1624,6 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
test_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags) ?
NFS_FILE_SYNC : NFS_UNSTABLE;
nfsd4_copy_set_sync(copy, sync);
- gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
}
static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
@@ -1767,7 +1760,7 @@ static int nfsd4_do_async_copy(void *data)
{
struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
- trace_nfsd_copy_do_async(copy);
+ trace_nfsd_copy_async(copy);
if (nfsd4_ssc_is_inter(copy)) {
struct file *filp;
@@ -1794,6 +1787,7 @@ static int nfsd4_do_async_copy(void *data)
do_callback:
set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
+ trace_nfsd_copy_async_done(copy);
nfsd4_send_cb_offload(copy);
cleanup_async_copy(copy);
return 0;
@@ -1803,9 +1797,11 @@ static __be32
nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd4_copy *async_copy = NULL;
struct nfsd4_copy *copy = &u->copy;
+ struct nfsd42_write_res *result;
__be32 status;
- struct nfsd4_copy *async_copy = NULL;
/*
* Currently, async COPY is not reliable. Force all COPY
@@ -1814,6 +1810,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
nfsd4_copy_set_sync(copy, true);
+ result = &copy->cp_res;
+ nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn);
+
copy->cp_clp = cstate->clp;
if (nfsd4_ssc_is_inter(copy)) {
trace_nfsd_copy_inter(copy);
@@ -1838,12 +1837,16 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&copy->fh, &cstate->current_fh.fh_handle,
sizeof(struct knfsd_fh));
if (nfsd4_copy_is_async(copy)) {
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
-
- status = nfserrno(-ENOMEM);
async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!async_copy)
goto out_err;
+ async_copy->cp_nn = nn;
+ /* Arbitrary cap on number of pending async copy operations */
+ if (atomic_inc_return(&nn->pending_async_copies) >
+ (int)rqstp->rq_pool->sp_nrthreads) {
+ atomic_dec(&nn->pending_async_copies);
+ goto out_err;
+ }
INIT_LIST_HEAD(&async_copy->copies);
refcount_set(&async_copy->refcount, 1);
async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
@@ -1851,8 +1854,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out_err;
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
- memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
- sizeof(copy->cp_res.cb_stateid));
+ memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid,
+ sizeof(result->cb_stateid));
dup_copy_fields(copy, async_copy);
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
async_copy, "%s", "copy thread");
@@ -1883,7 +1886,7 @@ out_err:
}
if (async_copy)
cleanup_async_copy(async_copy);
- status = nfserrno(-ENOMEM);
+ status = nfserr_jukebox;
goto out;
}
@@ -1942,7 +1945,7 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_copy_notify *cn = &u->copy_notify;
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- struct nfs4_stid *stid;
+ struct nfs4_stid *stid = NULL;
struct nfs4_cpntf_state *cps;
struct nfs4_client *clp = cstate->clp;
@@ -1951,6 +1954,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&stid);
if (status)
return status;
+ if (!stid)
+ return nfserr_bad_stateid;
cn->cpn_lease_time.tv_sec = nn->nfsd4_lease;
cn->cpn_lease_time.tv_nsec = 0;
@@ -2231,7 +2236,9 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
return nfserr_noent;
}
- exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+ exp = rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp),
+ rqstp->rq_client, rqstp->rq_gssclient,
+ map->fsid_type, map->fsid);
if (IS_ERR(exp)) {
dprintk("%s: could not find device id\n", __func__);
return nfserr_noent;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 67d8673a9391..b7d61eb8afe9 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -809,6 +809,10 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
ci = &cmsg->cm_u.cm_clntinfo;
if (get_user(namelen, &ci->cc_name.cn_len))
return -EFAULT;
+ if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) {
+ dprintk("%s: invalid namelen (%u)", __func__, namelen);
+ return -EINVAL;
+ }
name.data = memdup_user(&ci->cc_name.cn_id, namelen);
if (IS_ERR(name.data))
return PTR_ERR(name.data);
@@ -831,6 +835,10 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
cnm = &cmsg->cm_u.cm_name;
if (get_user(namelen, &cnm->cn_len))
return -EFAULT;
+ if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) {
+ dprintk("%s: invalid namelen (%u)", __func__, namelen);
+ return -EINVAL;
+ }
name.data = memdup_user(&cnm->cn_id, namelen);
if (IS_ERR(name.data))
return PTR_ERR(name.data);
@@ -1895,10 +1903,7 @@ nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
static void
nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
{
- smp_mb__before_atomic();
- clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
- smp_mb__after_atomic();
- wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
+ clear_and_wake_up_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
}
static void
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a366fb1c1b9b..ac1859c7cc9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -400,6 +400,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
.prepare = nfsd4_cb_notify_lock_prepare,
.done = nfsd4_cb_notify_lock_done,
.release = nfsd4_cb_notify_lock_release,
+ .opcode = OP_CB_NOTIFY_LOCK,
};
/*
@@ -1077,7 +1078,8 @@ static void nfs4_free_deleg(struct nfs4_stid *stid)
* When a delegation is recalled, the filehandle is stored in the "new"
* filter.
* Every 30 seconds we swap the filters and clear the "new" one,
- * unless both are empty of course.
+ * unless both are empty of course. This results in delegations for a
+ * given filehandle being blocked for between 30 and 60 seconds.
*
* Each filter is 256 bits. We hash the filehandle to 32bit and use the
* low 3 bytes as hash-table indices.
@@ -1106,9 +1108,9 @@ static int delegation_blocked(struct knfsd_fh *fh)
if (ktime_get_seconds() - bd->swap_time > 30) {
bd->entries -= bd->old_entries;
bd->old_entries = bd->entries;
+ bd->new = 1-bd->new;
memset(bd->set[bd->new], 0,
sizeof(bd->set[0]));
- bd->new = 1-bd->new;
bd->swap_time = ktime_get_seconds();
}
spin_unlock(&blocked_delegations_lock);
@@ -1663,9 +1665,7 @@ static void release_openowner(struct nfs4_openowner *oo)
{
struct nfs4_ol_stateid *stp;
struct nfs4_client *clp = oo->oo_owner.so_client;
- struct list_head reaplist;
-
- INIT_LIST_HEAD(&reaplist);
+ LIST_HEAD(reaplist);
spin_lock(&clp->cl_lock);
unhash_openowner_locked(oo);
@@ -2369,9 +2369,8 @@ __destroy_client(struct nfs4_client *clp)
int i;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
- struct list_head reaplist;
+ LIST_HEAD(reaplist);
- INIT_LIST_HEAD(&reaplist);
spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
@@ -2692,7 +2691,7 @@ static int client_info_show(struct seq_file *m, void *v)
clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
}
seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
- seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
+ seq_printf(m, "callback address: \"%pISpc\"\n", &clp->cl_cb_conn.cb_addr);
seq_printf(m, "admin-revoked states: %d\n",
atomic_read(&clp->cl_admin_revoked));
drop_client(clp);
@@ -3059,7 +3058,10 @@ nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_cb_fattr *ncf =
container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+ struct nfs4_delegation *dp =
+ container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+ trace_nfsd_cb_getattr_done(&dp->dl_stid.sc_stateid, task);
ncf->ncf_cb_status = task->tk_status;
switch (task->tk_status) {
case -NFS4ERR_DELAY:
@@ -3078,19 +3080,20 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
struct nfs4_delegation *dp =
container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
- clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
- wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
+ clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
nfs4_put_stid(&dp->dl_stid);
}
static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
.done = nfsd4_cb_recall_any_done,
.release = nfsd4_cb_recall_any_release,
+ .opcode = OP_CB_RECALL_ANY,
};
static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
.done = nfsd4_cb_getattr_done,
.release = nfsd4_cb_getattr_release,
+ .opcode = OP_CB_GETATTR,
};
static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
@@ -4704,6 +4707,7 @@ void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
if (so != NULL) {
cstate->replay_owner = NULL;
atomic_set(&so->so_replay.rp_locked, RP_UNLOCKED);
+ smp_mb__after_atomic();
wake_up_var(&so->so_replay.rp_locked);
nfs4_put_stateowner(so);
}
@@ -5004,6 +5008,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
* so tell them to stop waiting.
*/
atomic_set(&oo->oo_owner.so_replay.rp_locked, RP_UNHASHED);
+ smp_mb__after_atomic();
wake_up_var(&oo->oo_owner.so_replay.rp_locked);
wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);
@@ -5218,6 +5223,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
.prepare = nfsd4_cb_recall_prepare,
.done = nfsd4_cb_recall_done,
.release = nfsd4_cb_recall_release,
+ .opcode = OP_CB_RECALL,
};
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
@@ -5277,11 +5283,8 @@ static bool nfsd_breaker_owns_lease(struct file_lease *fl)
struct svc_rqst *rqst;
struct nfs4_client *clp;
- if (!i_am_nfsd())
- return false;
- rqst = kthread_data(current);
- /* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
- if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
+ rqst = nfsd_current_rqst();
+ if (!nfsd_v4client(rqst))
return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
@@ -5859,7 +5862,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
/*
* Now that the deleg is set, check again to ensure that nothing
- * raced in and changed the mode while we weren't lookng.
+ * raced in and changed the mode while we weren't looking.
*/
status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file);
if (status)
@@ -5912,6 +5915,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
}
}
+static bool
+nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
+ struct kstat *stat)
+{
+ struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file);
+ struct path path;
+ int rc;
+
+ if (!nf)
+ return false;
+
+ path.mnt = currentfh->fh_export->ex_path.mnt;
+ path.dentry = file_dentry(nf->nf_file);
+
+ rc = vfs_getattr(&path, stat,
+ (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
+ AT_STATX_SYNC_AS_STAT);
+
+ nfsd_file_put(nf);
+ return rc == 0;
+}
+
/*
* The Linux NFS server does not offer write delegations to NFSv4.0
* clients in order to avoid conflicts between write delegations and
@@ -5947,7 +5972,6 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
int cb_up;
int status = 0;
struct kstat stat;
- struct path path;
cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
open->op_recall = false;
@@ -5983,20 +6007,16 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
- open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
- trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
- path.mnt = currentfh->fh_export->ex_path.mnt;
- path.dentry = currentfh->fh_dentry;
- if (vfs_getattr(&path, &stat,
- (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
- AT_STATX_SYNC_AS_STAT)) {
+ if (!nfs4_delegation_stat(dp, currentfh, &stat)) {
nfs4_put_stid(&dp->dl_stid);
destroy_delegation(dp);
goto out_no_deleg;
}
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
dp->dl_cb_fattr.ncf_initial_cinfo =
nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry));
+ trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
} else {
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
@@ -6271,7 +6291,6 @@ void nfsd4_ssc_init_umount_work(struct nfsd_net *nn)
INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list);
init_waitqueue_head(&nn->nfsd_ssc_waitq);
}
-EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
/*
* This is called when nfsd is being shutdown, after all inter_ssc
@@ -6619,9 +6638,8 @@ deleg_reaper(struct nfsd_net *nn)
{
struct list_head *pos, *next;
struct nfs4_client *clp;
- struct list_head cblist;
+ LIST_HEAD(cblist);
- INIT_LIST_HEAD(&cblist);
spin_lock(&nn->client_lock);
list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
@@ -6647,7 +6665,6 @@ deleg_reaper(struct nfsd_net *nn)
cl_ra_cblist);
list_del_init(&clp->cl_ra_cblist);
clp->cl_ra->ra_keep = 0;
- clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
BIT(RCA4_TYPE_MASK_WDATA_DLG);
trace_nfsd_cb_recall_any(clp->cl_ra);
@@ -6892,7 +6909,8 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
nf = nfs4_find_file(s, flags);
if (nf) {
- status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+ status = nfsd_permission(&rqstp->rq_cred,
+ fhp->fh_export, fhp->fh_dentry,
acc | NFSD_MAY_OWNER_OVERRIDE);
if (status) {
nfsd_file_put(nf);
@@ -7023,11 +7041,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
*nfp = NULL;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
- if (cstid)
- status = nfserr_bad_stateid;
- else
- status = check_special_stateids(net, fhp, stateid,
- flags);
+ status = check_special_stateids(net, fhp, stateid, flags);
goto done;
}
@@ -7481,8 +7495,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto put_stateid;
trace_nfsd_deleg_return(stateid);
- wake_up_var(d_inode(cstate->current_fh.fh_dentry));
destroy_delegation(dp);
+ smp_mb__after_atomic();
+ wake_up_var(d_inode(cstate->current_fh.fh_dentry));
put_stateid:
nfs4_put_stid(&dp->dl_stid);
out:
@@ -8338,7 +8353,7 @@ out:
* @cstate: NFSv4 COMPOUND state
* @u: RELEASE_LOCKOWNER arguments
*
- * Check if theree are any locks still held and if not - free the lockowner
+ * Check if there are any locks still held and if not, free the lockowner
* and any lock state that is owned.
*
* Return values:
@@ -8557,6 +8572,7 @@ static int nfs4_state_create_net(struct net *net)
spin_lock_init(&nn->client_lock);
spin_lock_init(&nn->s2s_cp_lock);
idr_init(&nn->s2s_cp_stateids);
+ atomic_set(&nn->pending_async_copies, 0);
spin_lock_init(&nn->blocked_locks_lock);
INIT_LIST_HEAD(&nn->blocked_locks_lru);
@@ -8836,6 +8852,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file_lock_context *ctx;
+ struct nfs4_delegation *dp = NULL;
struct file_lease *fl;
struct iattr attrs;
struct nfs4_cb_fattr *ncf;
@@ -8845,84 +8862,76 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
ctx = locks_inode_context(inode);
if (!ctx)
return 0;
+
+#define NON_NFSD_LEASE ((void *)1)
+
spin_lock(&ctx->flc_lock);
for_each_file_lock(fl, &ctx->flc_lease) {
- unsigned char type = fl->c.flc_type;
-
if (fl->c.flc_flags == FL_LAYOUT)
continue;
- if (fl->fl_lmops != &nfsd_lease_mng_ops) {
- /*
- * non-nfs lease, if it's a lease with F_RDLCK then
- * we are done; there isn't any write delegation
- * on this inode
- */
- if (type == F_RDLCK)
- break;
-
- nfsd_stats_wdeleg_getattr_inc(nn);
- spin_unlock(&ctx->flc_lock);
-
- status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (fl->c.flc_type == F_WRLCK) {
+ if (fl->fl_lmops == &nfsd_lease_mng_ops)
+ dp = fl->c.flc_owner;
+ else
+ dp = NON_NFSD_LEASE;
+ }
+ break;
+ }
+ if (dp == NULL || dp == NON_NFSD_LEASE ||
+ dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
+ spin_unlock(&ctx->flc_lock);
+ if (dp == NON_NFSD_LEASE) {
+ status = nfserrno(nfsd_open_break_lease(inode,
+ NFSD_MAY_READ));
if (status != nfserr_jukebox ||
!nfsd_wait_for_delegreturn(rqstp, inode))
return status;
- return 0;
}
- if (type == F_WRLCK) {
- struct nfs4_delegation *dp = fl->c.flc_owner;
+ return 0;
+ }
- if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
- spin_unlock(&ctx->flc_lock);
- return 0;
- }
- nfsd_stats_wdeleg_getattr_inc(nn);
- dp = fl->c.flc_owner;
- refcount_inc(&dp->dl_stid.sc_count);
- ncf = &dp->dl_cb_fattr;
- nfs4_cb_getattr(&dp->dl_cb_fattr);
- spin_unlock(&ctx->flc_lock);
- wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY,
- TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
- if (ncf->ncf_cb_status) {
- /* Recall delegation only if client didn't respond */
- status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
- if (status != nfserr_jukebox ||
- !nfsd_wait_for_delegreturn(rqstp, inode)) {
- nfs4_put_stid(&dp->dl_stid);
- return status;
- }
- }
- if (!ncf->ncf_file_modified &&
- (ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
- ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
- ncf->ncf_file_modified = true;
- if (ncf->ncf_file_modified) {
- int err;
-
- /*
- * Per section 10.4.3 of RFC 8881, the server would
- * not update the file's metadata with the client's
- * modified size
- */
- attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
- attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG;
- inode_lock(inode);
- err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL);
- inode_unlock(inode);
- if (err) {
- nfs4_put_stid(&dp->dl_stid);
- return nfserrno(err);
- }
- ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
- *size = ncf->ncf_cur_fsize;
- *modified = true;
- }
- nfs4_put_stid(&dp->dl_stid);
- return 0;
+ nfsd_stats_wdeleg_getattr_inc(nn);
+ refcount_inc(&dp->dl_stid.sc_count);
+ ncf = &dp->dl_cb_fattr;
+ nfs4_cb_getattr(&dp->dl_cb_fattr);
+ spin_unlock(&ctx->flc_lock);
+
+ wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY,
+ TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
+ if (ncf->ncf_cb_status) {
+ /* Recall delegation only if client didn't respond */
+ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (status != nfserr_jukebox ||
+ !nfsd_wait_for_delegreturn(rqstp, inode))
+ goto out_status;
+ }
+ if (!ncf->ncf_file_modified &&
+ (ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
+ ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
+ ncf->ncf_file_modified = true;
+ if (ncf->ncf_file_modified) {
+ int err;
+
+ /*
+ * Per section 10.4.3 of RFC 8881, the server would
+ * not update the file's metadata with the client's
+ * modified size
+ */
+ attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
+ attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG;
+ inode_lock(inode);
+ err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL);
+ inode_unlock(inode);
+ if (err) {
+ status = nfserrno(err);
+ goto out_status;
}
- break;
+ ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
+ *size = ncf->ncf_cur_fsize;
+ *modified = true;
}
- spin_unlock(&ctx->flc_lock);
- return 0;
+ status = 0;
+out_status:
+ nfs4_put_stid(&dp->dl_stid);
+ return status;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 97f583777972..f118921250c3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1246,14 +1246,6 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
}
static __be32
-nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
-{
- if (argp->minorversion == 0)
- return nfs_ok;
- return nfserr_notsupp;
-}
-
-static __be32
nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
struct nfsd4_read *read = &u->read;
@@ -2374,7 +2366,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
[OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm,
[OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade,
[OP_PUTFH] = nfsd4_decode_putfh,
- [OP_PUTPUBFH] = nfsd4_decode_putpubfh,
+ [OP_PUTPUBFH] = nfsd4_decode_noop,
[OP_PUTROOTFH] = nfsd4_decode_noop,
[OP_READ] = nfsd4_decode_read,
[OP_READDIR] = nfsd4_decode_readdir,
@@ -5731,6 +5723,23 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
return nfserr_rep_too_big;
}
+static __be32 nfsd4_map_status(__be32 status, u32 minor)
+{
+ switch (status) {
+ case nfs_ok:
+ break;
+ case nfserr_wrong_type:
+ /* RFC 8881 - 15.1.2.9 */
+ if (minor == 0)
+ status = nfserr_inval;
+ break;
+ case nfserr_symlink_not_dir:
+ status = nfserr_symlink;
+ break;
+ }
+ return status;
+}
+
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
@@ -5798,6 +5807,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
so->so_replay.rp_buf, len);
}
status:
+ op->status = nfsd4_map_status(op->status,
+ resp->cstate.minorversion);
*p = op->status;
release:
if (opdesc && opdesc->op_release)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 34eb2c2cbcde..3adbc05ebaac 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -18,6 +18,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/module.h>
#include <linux/fsnotify.h>
+#include <linux/nfslocalio.h>
#include "idmap.h"
#include "nfsd.h"
@@ -174,6 +175,13 @@ static int export_features_show(struct seq_file *m, void *v)
DEFINE_SHOW_ATTRIBUTE(export_features);
+static int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+{
+ struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
+
+ return svc_pool_stats_open(&nn->nfsd_info, file);
+}
+
static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
@@ -1762,7 +1770,7 @@ int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info)
struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i];
err = nla_put_u32(skb, NFSD_A_SERVER_THREADS,
- atomic_read(&sp->sp_nrthreads));
+ sp->sp_nrthreads);
if (err)
goto err_unlock;
}
@@ -2224,8 +2232,9 @@ err_free_msg:
*/
static __net_init int nfsd_net_init(struct net *net)
{
- int retval;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int retval;
+ int i;
retval = nfsd_export_init(net);
if (retval)
@@ -2238,16 +2247,20 @@ static __net_init int nfsd_net_init(struct net *net)
if (retval)
goto out_repcache_error;
memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
- nn->nfsd_svcstats.program = &nfsd_program;
- nn->nfsd_versions = NULL;
- nn->nfsd4_minorversions = NULL;
+ nn->nfsd_svcstats.program = &nfsd_programs[0];
+ for (i = 0; i < sizeof(nn->nfsd_versions); i++)
+ nn->nfsd_versions[i] = nfsd_support_version(i);
+ for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++)
+ nn->nfsd4_minorversions[i] = nfsd_support_version(4);
nn->nfsd_info.mutex = &nfsd_mutex;
nn->nfsd_serv = NULL;
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
nfsd_proc_stat_init(net);
-
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ INIT_LIST_HEAD(&nn->local_clients);
+#endif
return 0;
out_repcache_error:
@@ -2258,6 +2271,22 @@ out_export_error:
return retval;
}
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+/**
+ * nfsd_net_pre_exit - Disconnect localio clients from net namespace
+ * @net: a network namespace that is about to be destroyed
+ *
+ * This invalidated ->net pointers held by localio clients
+ * while they can still safely access nn->counter.
+ */
+static __net_exit void nfsd_net_pre_exit(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs_uuid_invalidate_clients(&nn->local_clients);
+}
+#endif
+
/**
* nfsd_net_exit - Release the nfsd_net portion of a net namespace
* @net: a network namespace that is about to be destroyed
@@ -2271,11 +2300,13 @@ static __net_exit void nfsd_net_exit(struct net *net)
percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
- nfsd_netns_free_versions(nn);
}
static struct pernet_operations nfsd_net_ops = {
.init = nfsd_net_init,
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ .pre_exit = nfsd_net_pre_exit,
+#endif
.exit = nfsd_net_exit,
.id = &nfsd_net_id,
.size = sizeof(struct nfsd_net),
@@ -2313,6 +2344,7 @@ static int __init init_nfsd(void)
retval = genl_register_family(&nfsd_nl_family);
if (retval)
goto out_free_all;
+ nfsd_localio_ops_init();
return 0;
out_free_all:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index cec8697b1cd6..4b56ba1e8e48 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -23,9 +23,7 @@
#include <uapi/linux/nfsd/debug.h>
-#include "netns.h"
#include "export.h"
-#include "stats.h"
#undef ifdebug
#ifdef CONFIG_SUNRPC_DEBUG
@@ -37,7 +35,14 @@
/*
* nfsd version
*/
+#define NFSD_MINVERS 2
+#define NFSD_MAXVERS 4
#define NFSD_SUPPORTED_MINOR_VERSION 2
+bool nfsd_support_version(int vers);
+
+#include "netns.h"
+#include "stats.h"
+
/*
* Maximum blocksizes supported by daemon under various circumstances.
*/
@@ -80,7 +85,7 @@ struct nfsd_genl_rqstp {
u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
};
-extern struct svc_program nfsd_program;
+extern struct svc_program nfsd_programs[];
extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
@@ -111,11 +116,9 @@ int nfsd_nrthreads(struct net *);
int nfsd_nrpools(struct net *);
int nfsd_get_nrthreads(int n, int *, struct net *);
int nfsd_set_nrthreads(int n, int *, struct net *);
-int nfsd_pool_stats_open(struct inode *, struct file *);
-int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_shutdown_threads(struct net *net);
-bool i_am_nfsd(void);
+struct svc_rqst *nfsd_current_rqst(void);
struct nfsdfs_client {
struct kref cl_ref;
@@ -143,6 +146,10 @@ extern const struct svc_version nfsd_acl_version3;
#endif
#endif
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+extern const struct svc_version localio_version1;
+#endif
+
struct nfsd_net;
enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
@@ -156,7 +163,7 @@ extern int nfsd_max_blksize;
static inline int nfsd_v4client(struct svc_rqst *rq)
{
- return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+ return rq && rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
}
static inline struct user_namespace *
nfsd_user_namespace(const struct svc_rqst *rqstp)
@@ -327,17 +334,36 @@ void nfsd_lockd_shutdown(void);
#define nfserr_xattr2big cpu_to_be32(NFS4ERR_XATTR2BIG)
#define nfserr_noxattr cpu_to_be32(NFS4ERR_NOXATTR)
-/* error codes for internal use */
+/*
+ * Error codes for internal use. We use enum to choose numbers that are
+ * not already assigned, then covert to be32 resulting in a number that
+ * cannot conflict with any existing be32 nfserr value.
+ */
+enum {
+ NFSERR_DROPIT = NFS4ERR_FIRST_FREE,
/* if a request fails due to kmalloc failure, it gets dropped.
* Client should resend eventually
*/
-#define nfserr_dropit cpu_to_be32(30000)
+#define nfserr_dropit cpu_to_be32(NFSERR_DROPIT)
+
/* end-of-file indicator in readdir */
-#define nfserr_eof cpu_to_be32(30001)
+ NFSERR_EOF,
+#define nfserr_eof cpu_to_be32(NFSERR_EOF)
+
/* replay detected */
-#define nfserr_replay_me cpu_to_be32(11001)
+ NFSERR_REPLAY_ME,
+#define nfserr_replay_me cpu_to_be32(NFSERR_REPLAY_ME)
+
/* nfs41 replay detected */
-#define nfserr_replay_cache cpu_to_be32(11002)
+ NFSERR_REPLAY_CACHE,
+#define nfserr_replay_cache cpu_to_be32(NFSERR_REPLAY_CACHE)
+
+/* symlink found where dir expected - handled differently to
+ * other symlink found errors by NFSv3.
+ */
+ NFSERR_SYMLINK_NOT_DIR,
+#define nfserr_symlink_not_dir cpu_to_be32(NFSERR_SYMLINK_NOT_DIR)
+};
/* Check for dir entries '.' and '..' */
#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index dd4e11a703aa..40ad58a6a036 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -62,8 +62,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
- umode_t requested)
+nfsd_mode_check(struct dentry *dentry, umode_t requested)
{
umode_t mode = d_inode(dentry)->i_mode & S_IFMT;
@@ -76,36 +75,36 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
}
return nfs_ok;
}
- /*
- * v4 has an error more specific than err_notdir which we should
- * return in preference to err_notdir:
- */
- if (rqstp->rq_vers == 4 && mode == S_IFLNK)
+ if (mode == S_IFLNK) {
+ if (requested == S_IFDIR)
+ return nfserr_symlink_not_dir;
return nfserr_symlink;
+ }
if (requested == S_IFDIR)
return nfserr_notdir;
if (mode == S_IFDIR)
return nfserr_isdir;
- return nfserr_inval;
+ return nfserr_wrong_type;
}
-static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags)
+static bool nfsd_originating_port_ok(struct svc_rqst *rqstp,
+ struct svc_cred *cred,
+ struct svc_export *exp)
{
- if (flags & NFSEXP_INSECURE_PORT)
+ if (nfsexp_flags(cred, exp) & NFSEXP_INSECURE_PORT)
return true;
/* We don't require gss requests to use low ports: */
- if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS)
+ if (cred->cr_flavor >= RPC_AUTH_GSS)
return true;
return test_bit(RQ_SECURE, &rqstp->rq_flags);
}
static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
+ struct svc_cred *cred,
struct svc_export *exp)
{
- int flags = nfsexp_flags(rqstp, exp);
-
/* Check if the request originated from a secure port. */
- if (!nfsd_originating_port_ok(rqstp, flags)) {
+ if (rqstp && !nfsd_originating_port_ok(rqstp, cred, exp)) {
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
dprintk("nfsd: request from insecure port %s!\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
@@ -113,23 +112,15 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
}
/* Set user creds for this exportpoint */
- return nfserrno(nfsd_setuser(rqstp, exp));
+ return nfserrno(nfsd_setuser(cred, exp));
}
-static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
- struct dentry *dentry, struct svc_export *exp)
+static inline __be32 check_pseudo_root(struct dentry *dentry,
+ struct svc_export *exp)
{
if (!(exp->ex_flags & NFSEXP_V4ROOT))
return nfs_ok;
/*
- * v2/v3 clients have no need for the V4ROOT export--they use
- * the mount protocl instead; also, further V4ROOT checks may be
- * in v4-specific code, in which case v2/v3 clients could bypass
- * them.
- */
- if (!nfsd_v4client(rqstp))
- return nfserr_stale;
- /*
* We're exposing only the directories and symlinks that have to be
* traversed on the way to real exports:
*/
@@ -151,7 +142,11 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
* dentry. On success, the results are used to set fh_export and
* fh_dentry.
*/
-static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
+ struct svc_cred *cred,
+ struct auth_domain *client,
+ struct auth_domain *gssclient,
+ struct svc_fh *fhp)
{
struct knfsd_fh *fh = &fhp->fh_handle;
struct fid *fid = NULL;
@@ -162,10 +157,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
int len;
__be32 error;
- error = nfserr_stale;
- if (rqstp->rq_vers > 2)
- error = nfserr_badhandle;
- if (rqstp->rq_vers == 4 && fh->fh_size == 0)
+ error = nfserr_badhandle;
+ if (fh->fh_size == 0)
return nfserr_nofilehandle;
if (fh->fh_version != 1)
@@ -195,7 +188,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
data_left -= len;
if (data_left < 0)
return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ exp = rqst_exp_find(rqstp ? &rqstp->rq_chandle : NULL,
+ net, client, gssclient,
+ fh->fh_fsid_type, fh->fh_fsid);
fid = (struct fid *)(fh->fh_fsid + len);
error = nfserr_stale;
@@ -229,7 +224,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
put_cred(override_creds(new));
put_cred(new);
} else {
- error = nfsd_setuser_and_check_port(rqstp, exp);
+ error = nfsd_setuser_and_check_port(rqstp, cred, exp);
if (error)
goto out;
}
@@ -237,9 +232,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
/*
* Look up the dentry using the NFS file handle.
*/
- error = nfserr_stale;
- if (rqstp->rq_vers > 2)
- error = nfserr_badhandle;
+ error = nfserr_badhandle;
fileid_type = fh->fh_fileid_type;
@@ -278,17 +271,25 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
fhp->fh_dentry = dentry;
fhp->fh_export = exp;
- switch (rqstp->rq_vers) {
- case 4:
+ switch (fhp->fh_maxsize) {
+ case NFS4_FHSIZE:
if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR)
fhp->fh_no_atomic_attr = true;
+ fhp->fh_64bit_cookies = true;
break;
- case 3:
+ case NFS3_FHSIZE:
if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC)
fhp->fh_no_wcc = true;
+ fhp->fh_64bit_cookies = true;
+ if (exp->ex_flags & NFSEXP_V4ROOT)
+ goto out;
break;
- case 2:
+ case NFS_FHSIZE:
fhp->fh_no_wcc = true;
+ if (EX_WGATHER(exp))
+ fhp->fh_use_wgather = true;
+ if (exp->ex_flags & NFSEXP_V4ROOT)
+ goto out;
}
return 0;
@@ -298,42 +299,33 @@ out:
}
/**
- * fh_verify - filehandle lookup and access checking
- * @rqstp: pointer to current rpc request
+ * __fh_verify - filehandle lookup and access checking
+ * @rqstp: RPC transaction context, or NULL
+ * @net: net namespace in which to perform the export lookup
+ * @cred: RPC user credential
+ * @client: RPC auth domain
+ * @gssclient: RPC GSS auth domain, or NULL
* @fhp: filehandle to be verified
* @type: expected type of object pointed to by filehandle
* @access: type of access needed to object
*
- * Look up a dentry from the on-the-wire filehandle, check the client's
- * access to the export, and set the current task's credentials.
- *
- * Regardless of success or failure of fh_verify(), fh_put() should be
- * called on @fhp when the caller is finished with the filehandle.
- *
- * fh_verify() may be called multiple times on a given filehandle, for
- * example, when processing an NFSv4 compound. The first call will look
- * up a dentry using the on-the-wire filehandle. Subsequent calls will
- * skip the lookup and just perform the other checks and possibly change
- * the current task's credentials.
- *
- * @type specifies the type of object expected using one of the S_IF*
- * constants defined in include/linux/stat.h. The caller may use zero
- * to indicate that it doesn't care, or a negative integer to indicate
- * that it expects something not of the given type.
- *
- * @access is formed from the NFSD_MAY_* constants defined in
- * fs/nfsd/vfs.h.
+ * See fh_verify() for further descriptions of @fhp, @type, and @access.
*/
-__be32
-fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+static __be32
+__fh_verify(struct svc_rqst *rqstp,
+ struct net *net, struct svc_cred *cred,
+ struct auth_domain *client,
+ struct auth_domain *gssclient,
+ struct svc_fh *fhp, umode_t type, int access)
{
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_export *exp = NULL;
struct dentry *dentry;
__be32 error;
if (!fhp->fh_dentry) {
- error = nfsd_set_fh_dentry(rqstp, fhp);
+ error = nfsd_set_fh_dentry(rqstp, net, cred, client,
+ gssclient, fhp);
if (error)
goto out;
}
@@ -358,15 +350,15 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
* (for example, if different id-squashing options are in
* effect on the new filesystem).
*/
- error = check_pseudo_root(rqstp, dentry, exp);
+ error = check_pseudo_root(dentry, exp);
if (error)
goto out;
- error = nfsd_setuser_and_check_port(rqstp, exp);
+ error = nfsd_setuser_and_check_port(rqstp, cred, exp);
if (error)
goto out;
- error = nfsd_mode_check(rqstp, dentry, type);
+ error = nfsd_mode_check(dentry, type);
if (error)
goto out;
@@ -392,7 +384,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
skip_pseudoflavor_check:
/* Finally, check access permissions. */
- error = nfsd_permission(rqstp, exp, dentry, access);
+ error = nfsd_permission(cred, exp, dentry, access);
out:
trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
if (error == nfserr_stale)
@@ -400,6 +392,63 @@ out:
return error;
}
+/**
+ * fh_verify_local - filehandle lookup and access checking
+ * @net: net namespace in which to perform the export lookup
+ * @cred: RPC user credential
+ * @client: RPC auth domain
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * This API can be used by callers who do not have an RPC
+ * transaction context (ie are not running in an nfsd thread).
+ *
+ * See fh_verify() for further descriptions of @fhp, @type, and @access.
+ */
+__be32
+fh_verify_local(struct net *net, struct svc_cred *cred,
+ struct auth_domain *client, struct svc_fh *fhp,
+ umode_t type, int access)
+{
+ return __fh_verify(NULL, net, cred, client, NULL,
+ fhp, type, access);
+}
+
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
+ *
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound. The first call will look
+ * up a dentry using the on-the-wire filehandle. Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
+ *
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h. The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * fs/nfsd/vfs.h.
+ */
+__be32
+fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+{
+ return __fh_verify(rqstp, SVC_NET(rqstp), &rqstp->rq_cred,
+ rqstp->rq_client, rqstp->rq_gssclient,
+ fhp, type, access);
+}
/*
* Compose a file handle for an NFS reply.
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 6ebdf7ea27bf..5b7394801dc4 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -88,6 +88,8 @@ typedef struct svc_fh {
* wcc data is not atomic with
* operation
*/
+ bool fh_use_wgather; /* NFSv2 wgather option */
+ bool fh_64bit_cookies;/* readdir cookie size */
int fh_flags; /* FH flags */
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
@@ -215,6 +217,8 @@ extern char * SVCFH_fmt(struct svc_fh *fhp);
* Function prototypes
*/
__be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int);
+__be32 fh_verify_local(struct net *, struct svc_cred *, struct auth_domain *,
+ struct svc_fh *, umode_t, int);
__be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
__be32 fh_update(struct svc_fh *);
void fh_put(struct svc_fh *);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 36370b957b63..6dda081eb24c 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -13,6 +13,31 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
+static __be32 nfsd_map_status(__be32 status)
+{
+ switch (status) {
+ case nfs_ok:
+ break;
+ case nfserr_nofilehandle:
+ case nfserr_badhandle:
+ status = nfserr_stale;
+ break;
+ case nfserr_wrongsec:
+ case nfserr_xdev:
+ case nfserr_file_open:
+ status = nfserr_acces;
+ break;
+ case nfserr_symlink_not_dir:
+ status = nfserr_notdir;
+ break;
+ case nfserr_symlink:
+ case nfserr_wrong_type:
+ status = nfserr_inval;
+ break;
+ }
+ return status;
+}
+
static __be32
nfsd_proc_null(struct svc_rqst *rqstp)
{
@@ -38,6 +63,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp)
goto out;
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -109,6 +135,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -143,6 +170,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -164,6 +192,7 @@ nfsd_proc_readlink(struct svc_rqst *rqstp)
page_address(resp->page), &resp->len);
fh_put(&argp->fh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -200,6 +229,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
set_bit(RQ_DROPME, &rqstp->rq_flags);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -235,6 +265,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
set_bit(RQ_DROPME, &rqstp->rq_flags);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -331,10 +362,11 @@ nfsd_proc_create(struct svc_rqst *rqstp)
* echo thing > device-special-file-or-pipe
* by doing a CREATE with type==0
*/
- resp->status = nfsd_permission(rqstp,
- newfhp->fh_export,
- newfhp->fh_dentry,
- NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
+ resp->status = nfsd_permission(
+ &rqstp->rq_cred,
+ newfhp->fh_export,
+ newfhp->fh_dentry,
+ NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
if (resp->status && resp->status != nfserr_rofs)
goto out_unlock;
}
@@ -403,6 +435,7 @@ done:
goto out;
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -419,6 +452,7 @@ nfsd_proc_remove(struct svc_rqst *rqstp)
resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR,
argp->name, argp->len);
fh_put(&argp->fh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -437,6 +471,7 @@ nfsd_proc_rename(struct svc_rqst *rqstp)
&argp->tfh, argp->tname, argp->tlen);
fh_put(&argp->ffh);
fh_put(&argp->tfh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -457,6 +492,7 @@ nfsd_proc_link(struct svc_rqst *rqstp)
&argp->ffh);
fh_put(&argp->ffh);
fh_put(&argp->tfh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -495,6 +531,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
fh_put(&argp->ffh);
fh_put(&newfh);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -528,6 +565,7 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
resp->status = fh_getattr(&resp->fh, &resp->stat);
out:
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -545,6 +583,7 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR,
argp->name, argp->len);
fh_put(&argp->fh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -590,6 +629,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
nfssvc_encode_nfscookie(resp, offset);
fh_put(&argp->fh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
@@ -607,6 +647,7 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
+ resp->status = nfsd_map_status(resp->status);
return rpc_success;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 0bc8eaa5e009..e236135ddc63 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -19,6 +19,7 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
+#include <linux/nfslocalio.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>
@@ -35,7 +36,6 @@
#define NFSDDBG_FACILITY NFSDDBG_SVC
atomic_t nfsd_th_cnt = ATOMIC_INIT(0);
-extern struct svc_program nfsd_program;
static int nfsd(void *vrqstp);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static int nfsd_acl_rpcbind_set(struct net *,
@@ -80,6 +80,15 @@ DEFINE_SPINLOCK(nfsd_drc_lock);
unsigned long nfsd_drc_max_mem;
unsigned long nfsd_drc_mem_used;
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+static const struct svc_version *localio_versions[] = {
+ [1] = &localio_version1,
+};
+
+#define NFSD_LOCALIO_NRVERS ARRAY_SIZE(localio_versions)
+
+#endif /* CONFIG_NFS_LOCALIO */
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static const struct svc_version *nfsd_acl_version[] = {
# if defined(CONFIG_NFSD_V2_ACL)
@@ -90,23 +99,12 @@ static const struct svc_version *nfsd_acl_version[] = {
# endif
};
-#define NFSD_ACL_MINVERS 2
+#define NFSD_ACL_MINVERS 2
#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version)
-static struct svc_program nfsd_acl_program = {
- .pg_prog = NFS_ACL_PROGRAM,
- .pg_nvers = NFSD_ACL_NRVERS,
- .pg_vers = nfsd_acl_version,
- .pg_name = "nfsacl",
- .pg_class = "nfsd",
- .pg_authenticate = &svc_set_client,
- .pg_init_request = nfsd_acl_init_request,
- .pg_rpcbind_set = nfsd_acl_rpcbind_set,
-};
-
#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
-static const struct svc_version *nfsd_version[] = {
+static const struct svc_version *nfsd_version[NFSD_MAXVERS+1] = {
#if defined(CONFIG_NFSD_V2)
[2] = &nfsd_version2,
#endif
@@ -116,97 +114,63 @@ static const struct svc_version *nfsd_version[] = {
#endif
};
-#define NFSD_MINVERS 2
-#define NFSD_NRVERS ARRAY_SIZE(nfsd_version)
-
-struct svc_program nfsd_program = {
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
- .pg_next = &nfsd_acl_program,
-#endif
+struct svc_program nfsd_programs[] = {
+ {
.pg_prog = NFS_PROGRAM, /* program number */
- .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */
+ .pg_nvers = NFSD_MAXVERS+1, /* nr of entries in nfsd_version */
.pg_vers = nfsd_version, /* version table */
.pg_name = "nfsd", /* program name */
.pg_class = "nfsd", /* authentication class */
- .pg_authenticate = &svc_set_client, /* export authentication */
+ .pg_authenticate = svc_set_client, /* export authentication */
.pg_init_request = nfsd_init_request,
.pg_rpcbind_set = nfsd_rpcbind_set,
+ },
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+ {
+ .pg_prog = NFS_ACL_PROGRAM,
+ .pg_nvers = NFSD_ACL_NRVERS,
+ .pg_vers = nfsd_acl_version,
+ .pg_name = "nfsacl",
+ .pg_class = "nfsd",
+ .pg_authenticate = svc_set_client,
+ .pg_init_request = nfsd_acl_init_request,
+ .pg_rpcbind_set = nfsd_acl_rpcbind_set,
+ },
+#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+ {
+ .pg_prog = NFS_LOCALIO_PROGRAM,
+ .pg_nvers = NFSD_LOCALIO_NRVERS,
+ .pg_vers = localio_versions,
+ .pg_name = "nfslocalio",
+ .pg_class = "nfsd",
+ .pg_authenticate = svc_set_client,
+ .pg_init_request = svc_generic_init_request,
+ .pg_rpcbind_set = svc_generic_rpcbind_set,
+ }
+#endif /* CONFIG_NFS_LOCALIO */
};
bool nfsd_support_version(int vers)
{
- if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS)
+ if (vers >= NFSD_MINVERS && vers <= NFSD_MAXVERS)
return nfsd_version[vers] != NULL;
return false;
}
-static bool *
-nfsd_alloc_versions(void)
-{
- bool *vers = kmalloc_array(NFSD_NRVERS, sizeof(bool), GFP_KERNEL);
- unsigned i;
-
- if (vers) {
- /* All compiled versions are enabled by default */
- for (i = 0; i < NFSD_NRVERS; i++)
- vers[i] = nfsd_support_version(i);
- }
- return vers;
-}
-
-static bool *
-nfsd_alloc_minorversions(void)
-{
- bool *vers = kmalloc_array(NFSD_SUPPORTED_MINOR_VERSION + 1,
- sizeof(bool), GFP_KERNEL);
- unsigned i;
-
- if (vers) {
- /* All minor versions are enabled by default */
- for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
- vers[i] = nfsd_support_version(4);
- }
- return vers;
-}
-
-void
-nfsd_netns_free_versions(struct nfsd_net *nn)
-{
- kfree(nn->nfsd_versions);
- kfree(nn->nfsd4_minorversions);
- nn->nfsd_versions = NULL;
- nn->nfsd4_minorversions = NULL;
-}
-
-static void
-nfsd_netns_init_versions(struct nfsd_net *nn)
-{
- if (!nn->nfsd_versions) {
- nn->nfsd_versions = nfsd_alloc_versions();
- nn->nfsd4_minorversions = nfsd_alloc_minorversions();
- if (!nn->nfsd_versions || !nn->nfsd4_minorversions)
- nfsd_netns_free_versions(nn);
- }
-}
-
int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change)
{
- if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
+ if (vers < NFSD_MINVERS || vers > NFSD_MAXVERS)
return 0;
switch(change) {
case NFSD_SET:
- if (nn->nfsd_versions)
- nn->nfsd_versions[vers] = nfsd_support_version(vers);
+ nn->nfsd_versions[vers] = nfsd_support_version(vers);
break;
case NFSD_CLEAR:
- nfsd_netns_init_versions(nn);
- if (nn->nfsd_versions)
- nn->nfsd_versions[vers] = false;
+ nn->nfsd_versions[vers] = false;
break;
case NFSD_TEST:
- if (nn->nfsd_versions)
- return nn->nfsd_versions[vers];
- fallthrough;
+ return nn->nfsd_versions[vers];
case NFSD_AVAIL:
return nfsd_support_version(vers);
}
@@ -233,23 +197,16 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change
switch(change) {
case NFSD_SET:
- if (nn->nfsd4_minorversions) {
- nfsd_vers(nn, 4, NFSD_SET);
- nn->nfsd4_minorversions[minorversion] =
- nfsd_vers(nn, 4, NFSD_TEST);
- }
+ nfsd_vers(nn, 4, NFSD_SET);
+ nn->nfsd4_minorversions[minorversion] =
+ nfsd_vers(nn, 4, NFSD_TEST);
break;
case NFSD_CLEAR:
- nfsd_netns_init_versions(nn);
- if (nn->nfsd4_minorversions) {
- nn->nfsd4_minorversions[minorversion] = false;
- nfsd_adjust_nfsd_versions4(nn);
- }
+ nn->nfsd4_minorversions[minorversion] = false;
+ nfsd_adjust_nfsd_versions4(nn);
break;
case NFSD_TEST:
- if (nn->nfsd4_minorversions)
- return nn->nfsd4_minorversions[minorversion];
- return nfsd_vers(nn, 4, NFSD_TEST);
+ return nn->nfsd4_minorversions[minorversion];
case NFSD_AVAIL:
return minorversion <= NFSD_SUPPORTED_MINOR_VERSION &&
nfsd_vers(nn, 4, NFSD_AVAIL);
@@ -257,6 +214,34 @@ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change
return 0;
}
+bool nfsd_serv_try_get(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ return (nn && percpu_ref_tryget_live(&nn->nfsd_serv_ref));
+}
+
+void nfsd_serv_put(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ percpu_ref_put(&nn->nfsd_serv_ref);
+}
+
+static void nfsd_serv_done(struct percpu_ref *ref)
+{
+ struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref);
+
+ complete(&nn->nfsd_serv_confirm_done);
+}
+
+static void nfsd_serv_free(struct percpu_ref *ref)
+{
+ struct nfsd_net *nn = container_of(ref, struct nfsd_net, nfsd_serv_ref);
+
+ complete(&nn->nfsd_serv_free_done);
+}
+
/*
* Maximum number of nfsd processes
*/
@@ -456,6 +441,7 @@ static void nfsd_shutdown_net(struct net *net)
lockd_down(net);
nn->lockd_up = false;
}
+ percpu_ref_exit(&nn->nfsd_serv_ref);
nn->nfsd_net_up = false;
nfsd_shutdown_generic();
}
@@ -535,6 +521,13 @@ void nfsd_destroy_serv(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv = nn->nfsd_serv;
+ lockdep_assert_held(&nfsd_mutex);
+
+ percpu_ref_kill_and_confirm(&nn->nfsd_serv_ref, nfsd_serv_done);
+ wait_for_completion(&nn->nfsd_serv_confirm_done);
+ wait_for_completion(&nn->nfsd_serv_free_done);
+ /* percpu_ref_exit is called in nfsd_shutdown_net */
+
spin_lock(&nfsd_notifier_lock);
nn->nfsd_serv = NULL;
spin_unlock(&nfsd_notifier_lock);
@@ -568,11 +561,11 @@ void nfsd_reset_versions(struct nfsd_net *nn)
{
int i;
- for (i = 0; i < NFSD_NRVERS; i++)
+ for (i = 0; i <= NFSD_MAXVERS; i++)
if (nfsd_vers(nn, i, NFSD_TEST))
return;
- for (i = 0; i < NFSD_NRVERS; i++)
+ for (i = 0; i <= NFSD_MAXVERS; i++)
if (i != 4)
nfsd_vers(nn, i, NFSD_SET);
else {
@@ -642,9 +635,11 @@ void nfsd_shutdown_threads(struct net *net)
mutex_unlock(&nfsd_mutex);
}
-bool i_am_nfsd(void)
+struct svc_rqst *nfsd_current_rqst(void)
{
- return kthread_func(current) == nfsd;
+ if (kthread_func(current) == nfsd)
+ return kthread_data(current);
+ return NULL;
}
int nfsd_create_serv(struct net *net)
@@ -657,10 +652,18 @@ int nfsd_create_serv(struct net *net)
if (nn->nfsd_serv)
return 0;
+ error = percpu_ref_init(&nn->nfsd_serv_ref, nfsd_serv_free,
+ 0, GFP_KERNEL);
+ if (error)
+ return error;
+ init_completion(&nn->nfsd_serv_free_done);
+ init_completion(&nn->nfsd_serv_confirm_done);
+
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats,
+ serv = svc_create_pooled(nfsd_programs, ARRAY_SIZE(nfsd_programs),
+ &nn->nfsd_svcstats,
nfsd_max_blksize, nfsd);
if (serv == NULL)
return -ENOMEM;
@@ -705,7 +708,7 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
if (serv)
for (i = 0; i < serv->sv_nrpools && i < n; i++)
- nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads);
+ nthreads[i] = serv->sv_pools[i].sp_nrthreads;
return 0;
}
@@ -905,17 +908,17 @@ nfsd_init_request(struct svc_rqst *rqstp,
if (likely(nfsd_vers(nn, rqstp->rq_vers, NFSD_TEST)))
return svc_generic_init_request(rqstp, progp, ret);
- ret->mismatch.lovers = NFSD_NRVERS;
- for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) {
+ ret->mismatch.lovers = NFSD_MAXVERS + 1;
+ for (i = NFSD_MINVERS; i <= NFSD_MAXVERS; i++) {
if (nfsd_vers(nn, i, NFSD_TEST)) {
ret->mismatch.lovers = i;
break;
}
}
- if (ret->mismatch.lovers == NFSD_NRVERS)
+ if (ret->mismatch.lovers > NFSD_MAXVERS)
return rpc_prog_unavail;
ret->mismatch.hivers = NFSD_MINVERS;
- for (i = NFSD_NRVERS - 1; i >= NFSD_MINVERS; i--) {
+ for (i = NFSD_MAXVERS; i >= NFSD_MINVERS; i--) {
if (nfsd_vers(nn, i, NFSD_TEST)) {
ret->mismatch.hivers = i;
break;
@@ -937,11 +940,9 @@ nfsd(void *vrqstp)
/* At this point, the thread shares current->fs
* with the init process. We need to create files with the
- * umask as defined by the client instead of init's umask. */
- if (unshare_fs_struct() < 0) {
- printk("Unable to start nfsd thread: out of memory\n");
- goto out;
- }
+ * umask as defined by the client instead of init's umask.
+ */
+ svc_thread_init_status(rqstp, unshare_fs_struct());
current->fs->umask = 0;
@@ -963,14 +964,13 @@ nfsd(void *vrqstp)
atomic_dec(&nfsd_th_cnt);
-out:
/* Release the thread */
svc_exit_thread(rqstp);
return 0;
}
/**
- * nfsd_dispatch - Process an NFS or NFSACL Request
+ * nfsd_dispatch - Process an NFS or NFSACL or LOCALIO Request
* @rqstp: incoming request
*
* This RPC dispatcher integrates the NFS server's duplicate reply cache.
@@ -1084,10 +1084,3 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
return true;
}
-
-int nfsd_pool_stats_open(struct inode *inode, struct file *file)
-{
- struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
-
- return svc_pool_stats_open(&nn->nfsd_info, file);
-}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ec4559ecd193..79c743c01a47 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -79,6 +79,7 @@ struct nfsd4_callback_ops {
void (*prepare)(struct nfsd4_callback *);
int (*done)(struct nfsd4_callback *, struct rpc_task *);
void (*release)(struct nfsd4_callback *);
+ uint32_t opcode;
};
/*
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 77bbd23aa150..c625966cfcf3 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -86,7 +86,8 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
{ NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \
{ NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \
{ NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \
- { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" })
+ { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }, \
+ { NFSD_MAY_LOCALIO, "LOCALIO" })
TRACE_EVENT(nfsd_compound,
TP_PROTO(
@@ -193,7 +194,7 @@ TRACE_EVENT(nfsd_compound_encode_err,
{ S_IFIFO, "FIFO" }, \
{ S_IFSOCK, "SOCK" })
-TRACE_EVENT(nfsd_fh_verify,
+TRACE_EVENT_CONDITION(nfsd_fh_verify,
TP_PROTO(
const struct svc_rqst *rqstp,
const struct svc_fh *fhp,
@@ -201,6 +202,7 @@ TRACE_EVENT(nfsd_fh_verify,
int access
),
TP_ARGS(rqstp, fhp, type, access),
+ TP_CONDITION(rqstp != NULL),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
@@ -239,7 +241,7 @@ TRACE_EVENT_CONDITION(nfsd_fh_verify_err,
__be32 error
),
TP_ARGS(rqstp, fhp, type, access, error),
- TP_CONDITION(error),
+ TP_CONDITION(rqstp != NULL && error),
TP_STRUCT__entry(
__field(unsigned int, netns_ino)
__sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
@@ -295,12 +297,13 @@ DECLARE_EVENT_CLASS(nfsd_fh_err_class,
__entry->status)
)
-#define DEFINE_NFSD_FH_ERR_EVENT(name) \
-DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \
- TP_PROTO(struct svc_rqst *rqstp, \
- struct svc_fh *fhp, \
- int status), \
- TP_ARGS(rqstp, fhp, status))
+#define DEFINE_NFSD_FH_ERR_EVENT(name) \
+DEFINE_EVENT_CONDITION(nfsd_fh_err_class, nfsd_##name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ int status), \
+ TP_ARGS(rqstp, fhp, status), \
+ TP_CONDITION(rqstp != NULL))
DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport);
DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle);
@@ -1486,6 +1489,9 @@ DEFINE_NFSD_CB_EVENT(new_state);
DEFINE_NFSD_CB_EVENT(probe);
DEFINE_NFSD_CB_EVENT(lost);
DEFINE_NFSD_CB_EVENT(shutdown);
+DEFINE_NFSD_CB_EVENT(rpc_prepare);
+DEFINE_NFSD_CB_EVENT(rpc_done);
+DEFINE_NFSD_CB_EVENT(rpc_release);
TRACE_DEFINE_ENUM(RPC_AUTH_NULL);
TRACE_DEFINE_ENUM(RPC_AUTH_UNIX);
@@ -1553,6 +1559,19 @@ TRACE_EVENT(nfsd_cb_setup_err,
__entry->error)
);
+/* Not a real opcode, but there is no 0 operation. */
+#define _CB_NULL 0
+
+#define show_nfsd_cb_opcode(val) \
+ __print_symbolic(val, \
+ { _CB_NULL, "CB_NULL" }, \
+ { OP_CB_GETATTR, "CB_GETATTR" }, \
+ { OP_CB_RECALL, "CB_RECALL" }, \
+ { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \
+ { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \
+ { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \
+ { OP_CB_OFFLOAD, "CB_OFFLOAD" })
+
DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
TP_PROTO(
const struct nfs4_client *clp,
@@ -1563,6 +1582,7 @@ DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(const void *, cb)
+ __field(unsigned long, opcode)
__field(bool, need_restart)
__sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
@@ -1570,14 +1590,15 @@ DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cb = cb;
+ __entry->opcode = cb->cb_ops ? cb->cb_ops->opcode : _CB_NULL;
__entry->need_restart = cb->cb_need_restart;
__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen)
),
- TP_printk("addr=%pISpc client %08x:%08x cb=%p%s",
- __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
- __entry->cb, __entry->need_restart ?
- " (need restart)" : " (first try)"
+ TP_printk("addr=%pISpc client %08x:%08x cb=%p%s opcode=%s",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, __entry->cb,
+ __entry->need_restart ? " (need restart)" : " (first try)",
+ show_nfsd_cb_opcode(__entry->opcode)
)
);
@@ -1830,6 +1851,7 @@ DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done);
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_getattr_done);
TRACE_EVENT(nfsd_cb_recall_any_done,
TP_PROTO(
@@ -2127,6 +2149,10 @@ DECLARE_EVENT_CLASS(nfsd_copy_class,
__field(u32, dst_cl_id)
__field(u32, dst_so_id)
__field(u32, dst_si_generation)
+ __field(u32, cb_cl_boot)
+ __field(u32, cb_cl_id)
+ __field(u32, cb_so_id)
+ __field(u32, cb_si_generation)
__field(u64, src_cp_pos)
__field(u64, dst_cp_pos)
__field(u64, cp_count)
@@ -2135,6 +2161,7 @@ DECLARE_EVENT_CLASS(nfsd_copy_class,
TP_fast_assign(
const stateid_t *src_stp = &copy->cp_src_stateid;
const stateid_t *dst_stp = &copy->cp_dst_stateid;
+ const stateid_t *cb_stp = &copy->cp_res.cb_stateid;
__entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
__entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
@@ -2146,6 +2173,10 @@ DECLARE_EVENT_CLASS(nfsd_copy_class,
__entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
__entry->dst_so_id = dst_stp->si_opaque.so_id;
__entry->dst_si_generation = dst_stp->si_generation;
+ __entry->cb_cl_boot = cb_stp->si_opaque.so_clid.cl_boot;
+ __entry->cb_cl_id = cb_stp->si_opaque.so_clid.cl_id;
+ __entry->cb_so_id = cb_stp->si_opaque.so_id;
+ __entry->cb_si_generation = cb_stp->si_generation;
__entry->src_cp_pos = copy->cp_src_pos;
__entry->dst_cp_pos = copy->cp_dst_pos;
__entry->cp_count = copy->cp_count;
@@ -2153,14 +2184,17 @@ DECLARE_EVENT_CLASS(nfsd_copy_class,
sizeof(struct sockaddr_in6));
),
TP_printk("client=%pISpc intra=%d async=%d "
- "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
- "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] "
+ "src_client %08x:%08x src_stateid %08x:%08x "
+ "dst_client %08x:%08x dst_stateid %08x:%08x "
+ "cb_client %08x:%08x cb_stateid %08x:%08x "
"cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
__get_sockaddr(addr), __entry->intra, __entry->async,
- __entry->src_si_generation, __entry->src_cl_boot,
- __entry->src_cl_id, __entry->src_so_id,
- __entry->dst_si_generation, __entry->dst_cl_boot,
- __entry->dst_cl_id, __entry->dst_so_id,
+ __entry->src_cl_boot, __entry->src_cl_id,
+ __entry->src_so_id, __entry->src_si_generation,
+ __entry->dst_cl_boot, __entry->dst_cl_id,
+ __entry->dst_so_id, __entry->dst_si_generation,
+ __entry->cb_cl_boot, __entry->cb_cl_id,
+ __entry->cb_so_id, __entry->cb_si_generation,
__entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
)
);
@@ -2172,7 +2206,7 @@ DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \
DEFINE_COPY_EVENT(inter);
DEFINE_COPY_EVENT(intra);
-DEFINE_COPY_EVENT(do_async);
+DEFINE_COPY_EVENT(async);
TRACE_EVENT(nfsd_copy_done,
TP_PROTO(
@@ -2193,11 +2227,80 @@ TRACE_EVENT(nfsd_copy_done,
__assign_sockaddr(addr, &copy->cp_clp->cl_addr,
sizeof(struct sockaddr_in6));
),
- TP_printk("addr=%pISpc status=%d intra=%d async=%d ",
+ TP_printk("addr=%pISpc status=%d intra=%d async=%d",
__get_sockaddr(addr), __entry->status, __entry->intra, __entry->async
)
);
+TRACE_EVENT(nfsd_copy_async_done,
+ TP_PROTO(
+ const struct nfsd4_copy *copy
+ ),
+ TP_ARGS(copy),
+ TP_STRUCT__entry(
+ __field(int, status)
+ __field(bool, intra)
+ __field(bool, async)
+ __field(u32, src_cl_boot)
+ __field(u32, src_cl_id)
+ __field(u32, src_so_id)
+ __field(u32, src_si_generation)
+ __field(u32, dst_cl_boot)
+ __field(u32, dst_cl_id)
+ __field(u32, dst_so_id)
+ __field(u32, dst_si_generation)
+ __field(u32, cb_cl_boot)
+ __field(u32, cb_cl_id)
+ __field(u32, cb_so_id)
+ __field(u32, cb_si_generation)
+ __field(u64, src_cp_pos)
+ __field(u64, dst_cp_pos)
+ __field(u64, cp_count)
+ __sockaddr(addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+ const stateid_t *src_stp = &copy->cp_src_stateid;
+ const stateid_t *dst_stp = &copy->cp_dst_stateid;
+ const stateid_t *cb_stp = &copy->cp_res.cb_stateid;
+
+ __entry->status = be32_to_cpu(copy->nfserr);
+ __entry->intra = test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
+ __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
+ __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot;
+ __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id;
+ __entry->src_so_id = src_stp->si_opaque.so_id;
+ __entry->src_si_generation = src_stp->si_generation;
+ __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot;
+ __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id;
+ __entry->dst_so_id = dst_stp->si_opaque.so_id;
+ __entry->dst_si_generation = dst_stp->si_generation;
+ __entry->cb_cl_boot = cb_stp->si_opaque.so_clid.cl_boot;
+ __entry->cb_cl_id = cb_stp->si_opaque.so_clid.cl_id;
+ __entry->cb_so_id = cb_stp->si_opaque.so_id;
+ __entry->cb_si_generation = cb_stp->si_generation;
+ __entry->src_cp_pos = copy->cp_src_pos;
+ __entry->dst_cp_pos = copy->cp_dst_pos;
+ __entry->cp_count = copy->cp_count;
+ __assign_sockaddr(addr, &copy->cp_clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+ TP_printk("client=%pISpc status=%d intra=%d async=%d "
+ "src_client %08x:%08x src_stateid %08x:%08x "
+ "dst_client %08x:%08x dst_stateid %08x:%08x "
+ "cb_client %08x:%08x cb_stateid %08x:%08x "
+ "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu",
+ __get_sockaddr(addr),
+ __entry->status, __entry->intra, __entry->async,
+ __entry->src_cl_boot, __entry->src_cl_id,
+ __entry->src_so_id, __entry->src_si_generation,
+ __entry->dst_cl_boot, __entry->dst_cl_id,
+ __entry->dst_so_id, __entry->dst_si_generation,
+ __entry->cb_cl_boot, __entry->cb_cl_id,
+ __entry->cb_so_id, __entry->cb_si_generation,
+ __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 29b1f3613800..22325b590e17 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -100,6 +100,7 @@ nfserrno (int errno)
{ nfserr_io, -EUCLEAN },
{ nfserr_perm, -ENOKEY },
{ nfserr_no_grace, -ENOGRACE},
+ { nfserr_io, -EBADMSG },
};
int i;
@@ -421,8 +422,9 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (iap->ia_size < inode->i_size) {
__be32 err;
- err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+ err = nfsd_permission(&rqstp->rq_cred,
+ fhp->fh_export, fhp->fh_dentry,
+ NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
if (err)
return err;
}
@@ -814,7 +816,8 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
sresult |= map->access;
- err2 = nfsd_permission(rqstp, export, dentry, map->how);
+ err2 = nfsd_permission(&rqstp->rq_cred, export,
+ dentry, map->how);
switch (err2) {
case nfs_ok:
result |= map->access;
@@ -1160,7 +1163,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
errseq_t since;
__be32 nfserr;
int host_err;
- int use_wgather;
loff_t pos = offset;
unsigned long exp_op_flags = 0;
unsigned int pflags = current->flags;
@@ -1186,12 +1188,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
}
exp = fhp->fh_export;
- use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
if (!EX_ISSYNC(exp))
stable = NFS_UNSTABLE;
- if (stable && !use_wgather)
+ if (stable && !fhp->fh_use_wgather)
flags |= RWF_SYNC;
iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
@@ -1210,7 +1211,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (host_err < 0)
goto out_nfserr;
- if (stable && use_wgather) {
+ if (stable && fhp->fh_use_wgather) {
host_err = wait_for_concurrent_writes(file);
if (host_err < 0)
commit_reset_write_verifier(nn, rqstp, host_err);
@@ -1475,7 +1476,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
dirp = d_inode(dentry);
dchild = dget(resfhp->fh_dentry);
- err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+ err = nfsd_permission(&rqstp->rq_cred, fhp->fh_export, dentry,
+ NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1767,10 +1769,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
if (!err)
err = nfserrno(commit_metadata(tfhp));
} else {
- if (host_err == -EXDEV && rqstp->rq_vers == 2)
- err = nfserr_acces;
- else
- err = nfserrno(host_err);
+ err = nfserrno(host_err);
}
dput(dnew);
out_drop_write:
@@ -1836,7 +1835,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
- err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ err = nfserr_xdev;
if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
goto out;
if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
@@ -1851,7 +1850,7 @@ retry:
trap = lock_rename(tdentry, fdentry);
if (IS_ERR(trap)) {
- err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ err = nfserr_xdev;
goto out_want_write;
}
err = fh_fill_pre_attrs(ffhp);
@@ -2020,10 +2019,7 @@ out_nfserr:
/* name is mounted-on. There is no perfect
* error status.
*/
- if (nfsd_v4client(rqstp))
- err = nfserr_file_open;
- else
- err = nfserr_acces;
+ err = nfserr_file_open;
} else {
err = nfserrno(host_err);
}
@@ -2178,8 +2174,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
loff_t offset = *offsetp;
int may_flags = NFSD_MAY_READ;
- /* NFSv2 only supports 32 bit cookies */
- if (rqstp->rq_vers > 2)
+ if (fhp->fh_64bit_cookies)
may_flags |= NFSD_MAY_64BIT_COOKIE;
err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
@@ -2255,9 +2250,9 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in
return err;
}
-static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp)
+static int exp_rdonly(struct svc_cred *cred, struct svc_export *exp)
{
- return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY;
+ return nfsexp_flags(cred, exp) & NFSEXP_READONLY;
}
#ifdef CONFIG_NFSD_V4
@@ -2501,8 +2496,8 @@ out_unlock:
* Check for a user's access permissions to this inode.
*/
__be32
-nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
- struct dentry *dentry, int acc)
+nfsd_permission(struct svc_cred *cred, struct svc_export *exp,
+ struct dentry *dentry, int acc)
{
struct inode *inode = d_inode(dentry);
int err;
@@ -2533,7 +2528,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
*/
if (!(acc & NFSD_MAY_LOCAL_ACCESS))
if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
- if (exp_rdonly(rqstp, exp) ||
+ if (exp_rdonly(cred, exp) ||
__mnt_is_readonly(exp->ex_path.mnt))
return nfserr_rofs;
if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 57cd70062048..3ff146522556 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -33,6 +33,8 @@
#define NFSD_MAY_64BIT_COOKIE 0x1000 /* 64 bit readdir cookies for >= NFSv3 */
+#define NFSD_MAY_LOCALIO 0x2000 /* for tracing, reflects when localio used */
+
#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -153,8 +155,8 @@ __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
-__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
- struct dentry *, int);
+__be32 nfsd_permission(struct svc_cred *cred, struct svc_export *exp,
+ struct dentry *dentry, int acc);
void nfsd_filp_close(struct file *fp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index fbdd42cde1fa..2a21a7662e03 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -713,6 +713,7 @@ struct nfsd4_copy {
struct nfsd4_ssc_umount_item *ss_nsui;
struct nfs_fh c_fh;
nfs4_stateid stateid;
+ struct nfsd_net *cp_nn;
};
static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync)
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index d825a9faca6d..e19d7eb10084 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -37,7 +37,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
/**
- * nilfs_palloc_req - persistent allocator request and reply
+ * struct nilfs_palloc_req - persistent allocator request and reply
* @pr_entry_nr: entry number (vblocknr or inode number)
* @pr_desc_bh: buffer head of the buffer containing block group descriptors
* @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index cd14ea25968c..c9e8d9a7d820 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -349,7 +349,7 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
}
/**
- * nilfs_bmap_lookup_dirty_buffers -
+ * nilfs_bmap_lookup_dirty_buffers - collect dirty block buffers
* @bmap: bmap
* @listp: pointer to buffer head list
*/
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 608168a5cb88..4656df392722 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -44,6 +44,19 @@ struct nilfs_bmap_stats {
/**
* struct nilfs_bmap_operations - bmap operation table
+ * @bop_lookup: single block search operation
+ * @bop_lookup_contig: consecutive block search operation
+ * @bop_insert: block insertion operation
+ * @bop_delete: block delete operation
+ * @bop_clear: block mapping resource release operation
+ * @bop_propagate: operation to propagate dirty state towards the
+ * mapping root
+ * @bop_lookup_dirty_buffers: operation to collect dirty block buffers
+ * @bop_assign: disk block address assignment operation
+ * @bop_mark: operation to mark in-use blocks as dirty for
+ * relocation by GC
+ * @bop_seek_key: find valid block key operation
+ * @bop_last_key: find last valid block key operation
*/
struct nilfs_bmap_operations {
int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
@@ -66,7 +79,7 @@ struct nilfs_bmap_operations {
int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
- /* The following functions are internal use only. */
+ /* private: internal use only */
int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
int (*bop_check_delete)(struct nilfs_bmap *, __u64);
int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
@@ -74,9 +87,8 @@ struct nilfs_bmap_operations {
#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
-#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
-#define NILFS_BMAP_NEW_PTR_INIT \
- (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
+#define NILFS_BMAP_KEY_BIT BITS_PER_LONG
+#define NILFS_BMAP_NEW_PTR_INIT (1UL << (BITS_PER_LONG - 1))
static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
{
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c034080c334b..57b4af5ad646 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -179,11 +179,32 @@ void nilfs_btnode_delete(struct buffer_head *bh)
}
/**
- * nilfs_btnode_prepare_change_key
- * prepare to move contents of the block for old key to one of new key.
- * the old buffer will not be removed, but might be reused for new buffer.
- * it might return -ENOMEM because of memory allocation errors,
- * and might return -EIO because of disk read errors.
+ * nilfs_btnode_prepare_change_key - prepare to change the search key of a
+ * b-tree node block
+ * @btnc: page cache in which the b-tree node block is buffered
+ * @ctxt: structure for exchanging context information for key change
+ *
+ * nilfs_btnode_prepare_change_key() prepares to move the contents of the
+ * b-tree node block of the old key given in the "oldkey" member of @ctxt to
+ * the position of the new key given in the "newkey" member of @ctxt in the
+ * page cache @btnc. Here, the key of the block is an index in units of
+ * blocks, and if the page and block sizes match, it matches the page index
+ * in the page cache.
+ *
+ * If the page size and block size match, this function attempts to move the
+ * entire folio, and in preparation for this, inserts the original folio into
+ * the new index of the cache. If this insertion fails or if the page size
+ * and block size are different, it falls back to a copy preparation using
+ * nilfs_btnode_create_block(), inserts a new block at the position
+ * corresponding to "newkey", and stores the buffer head pointer in the
+ * "newbh" member of @ctxt.
+ *
+ * Note that the current implementation does not support folio sizes larger
+ * than the page size.
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EIO - I/O error (metadata corruption).
+ * * %-ENOMEM - Insufficient memory available.
*/
int nilfs_btnode_prepare_change_key(struct address_space *btnc,
struct nilfs_btnode_chkey_ctxt *ctxt)
@@ -245,8 +266,21 @@ retry:
}
/**
- * nilfs_btnode_commit_change_key
- * commit the change_key operation prepared by prepare_change_key().
+ * nilfs_btnode_commit_change_key - commit the change of the search key of
+ * a b-tree node block
+ * @btnc: page cache in which the b-tree node block is buffered
+ * @ctxt: structure for exchanging context information for key change
+ *
+ * nilfs_btnode_commit_change_key() executes the key change based on the
+ * context @ctxt prepared by nilfs_btnode_prepare_change_key(). If no valid
+ * block buffer is prepared in "newbh" of @ctxt (i.e., a full folio move),
+ * this function removes the folio from the old index and completes the move.
+ * Otherwise, it copies the block data and inherited flag states of "oldbh"
+ * to "newbh" and clears the "oldbh" from the cache. In either case, the
+ * relocated buffer is marked as dirty.
+ *
+ * As with nilfs_btnode_prepare_change_key(), the current implementation does
+ * not support folio sizes larger than the page size.
*/
void nilfs_btnode_commit_change_key(struct address_space *btnc,
struct nilfs_btnode_chkey_ctxt *ctxt)
@@ -285,8 +319,19 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
}
/**
- * nilfs_btnode_abort_change_key
- * abort the change_key operation prepared by prepare_change_key().
+ * nilfs_btnode_abort_change_key - abort the change of the search key of a
+ * b-tree node block
+ * @btnc: page cache in which the b-tree node block is buffered
+ * @ctxt: structure for exchanging context information for key change
+ *
+ * nilfs_btnode_abort_change_key() cancels the key change associated with the
+ * context @ctxt prepared via nilfs_btnode_prepare_change_key() and performs
+ * any necessary cleanup. If no valid block buffer is prepared in "newbh" of
+ * @ctxt, this function removes the folio from the destination index and aborts
+ * the move. Otherwise, it clears "newbh" from the cache.
+ *
+ * As with nilfs_btnode_prepare_change_key(), the current implementation does
+ * not support folio sizes larger than the page size.
*/
void nilfs_btnode_abort_change_key(struct address_space *btnc,
struct nilfs_btnode_chkey_ctxt *ctxt)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 862bdf23120e..ef5061bb56da 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -350,7 +350,7 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX ||
(flags & NILFS_BTREE_NODE_ROOT) ||
- nchildren < 0 ||
+ nchildren <= 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
nilfs_crit(inode->i_sb,
"bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
@@ -381,7 +381,8 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
- nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+ nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX ||
+ (nchildren == 0 && level > NILFS_BTREE_LEVEL_NODE_MIN))) {
nilfs_crit(inode->i_sb,
"bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
inode->i_ino, level, flags, nchildren);
@@ -1658,13 +1659,16 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
int nchildren, ret;
root = nilfs_btree_get_root(btree);
+ nchildren = nilfs_btree_node_get_nchildren(root);
+ if (unlikely(nchildren == 0))
+ return 0;
+
switch (nilfs_btree_height(btree)) {
case 2:
bh = NULL;
node = root;
break;
case 3:
- nchildren = nilfs_btree_node_get_nchildren(root);
if (nchildren > 1)
return 0;
ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
@@ -1673,12 +1677,12 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
if (ret < 0)
return ret;
node = (struct nilfs_btree_node *)bh->b_data;
+ nchildren = nilfs_btree_node_get_nchildren(node);
break;
default:
return 0;
}
- nchildren = nilfs_btree_node_get_nchildren(node);
maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
nextmaxkey = (nchildren > 1) ?
nilfs_btree_node_get_key(node, nchildren - 2) : 0;
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 92868e1a48ca..2a220f716c91 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -24,6 +24,7 @@
* @bp_index: index of child node
* @bp_oldreq: ptr end request for old ptr
* @bp_newreq: ptr alloc request for new ptr
+ * @bp_ctxt: context information for changing the key of a b-tree node block
* @bp_op: rebalance operation
*/
struct nilfs_btree_path {
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 69a5cced1e84..f0ce37552446 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -125,10 +125,17 @@ static void nilfs_cpfile_block_init(struct inode *cpfile,
}
}
-static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
- struct buffer_head **bhp)
+static int nilfs_cpfile_get_header_block(struct inode *cpfile,
+ struct buffer_head **bhp)
{
- return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
+ int err = nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
+
+ if (unlikely(err == -ENOENT)) {
+ nilfs_error(cpfile->i_sb,
+ "missing header block in checkpoint metadata");
+ err = -EIO;
+ }
+ return err;
}
static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
@@ -283,14 +290,9 @@ int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
down_write(&NILFS_MDT(cpfile)->mi_sem);
ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
- if (unlikely(ret < 0)) {
- if (ret == -ENOENT) {
- nilfs_error(cpfile->i_sb,
- "checkpoint creation failed due to metadata corruption.");
- ret = -EIO;
- }
+ if (unlikely(ret < 0))
goto out_sem;
- }
+
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh);
if (unlikely(ret < 0))
goto out_header;
@@ -704,9 +706,15 @@ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
}
/**
- * nilfs_cpfile_delete_checkpoint -
- * @cpfile:
- * @cno:
+ * nilfs_cpfile_delete_checkpoint - delete a checkpoint
+ * @cpfile: checkpoint file inode
+ * @cno: checkpoint number to delete
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EBUSY - Checkpoint in use (snapshot specified).
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOENT - No valid checkpoint found.
+ * * %-ENOMEM - Insufficient memory available.
*/
int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
{
@@ -968,21 +976,15 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
}
/**
- * nilfs_cpfile_is_snapshot -
+ * nilfs_cpfile_is_snapshot - determine if checkpoint is a snapshot
* @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- *
- * Description:
- *
- * Return Value: On success, 1 is returned if the checkpoint specified by
- * @cno is a snapshot, or 0 if not. On error, one of the following negative
- * error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
+ * @cno: checkpoint number
*
- * %-ENOENT - No such checkpoint.
+ * Return: 1 if the checkpoint specified by @cno is a snapshot, 0 if not, or
+ * the following negative error code on failure.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOENT - No such checkpoint.
+ * * %-ENOMEM - Insufficient memory available.
*/
int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
{
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fc1caf63a42a..0bef662176a4 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -271,18 +271,15 @@ void nilfs_dat_abort_update(struct inode *dat,
}
/**
- * nilfs_dat_mark_dirty -
- * @dat: DAT file inode
+ * nilfs_dat_mark_dirty - mark the DAT block buffer containing the specified
+ * virtual block address entry as dirty
+ * @dat: DAT file inode
* @vblocknr: virtual block number
*
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid DAT entry (internal code).
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOMEM - Insufficient memory available.
*/
int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
{
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 4b3e19d74925..fe5b1a30c509 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -231,37 +231,6 @@ static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
nilfs_rec_len_from_disk(p->rec_len));
}
-static unsigned char
-nilfs_filetype_table[NILFS_FT_MAX] = {
- [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
- [NILFS_FT_REG_FILE] = DT_REG,
- [NILFS_FT_DIR] = DT_DIR,
- [NILFS_FT_CHRDEV] = DT_CHR,
- [NILFS_FT_BLKDEV] = DT_BLK,
- [NILFS_FT_FIFO] = DT_FIFO,
- [NILFS_FT_SOCK] = DT_SOCK,
- [NILFS_FT_SYMLINK] = DT_LNK,
-};
-
-#define S_SHIFT 12
-static unsigned char
-nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
- [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
- [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
-};
-
-static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
-{
- umode_t mode = inode->i_mode;
-
- de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
static int nilfs_readdir(struct file *file, struct dir_context *ctx)
{
loff_t pos = ctx->pos;
@@ -297,10 +266,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
if (de->inode) {
unsigned char t;
- if (de->file_type < NILFS_FT_MAX)
- t = nilfs_filetype_table[de->file_type];
- else
- t = DT_UNKNOWN;
+ t = fs_ftype_to_dtype(de->file_type);
if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode), t)) {
@@ -444,7 +410,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
err = nilfs_prepare_chunk(folio, from, to);
BUG_ON(err);
de->inode = cpu_to_le64(inode->i_ino);
- nilfs_set_de_type(de, inode);
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
nilfs_commit_chunk(folio, mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
@@ -531,7 +497,7 @@ got_it:
de->name_len = namelen;
memcpy(de->name, name, namelen);
de->inode = cpu_to_le64(inode->i_ino);
- nilfs_set_de_type(de, inode);
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
nilfs_commit_chunk(folio, folio->mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
@@ -612,14 +578,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
memcpy(de->name, ".\0\0", 4);
de->inode = cpu_to_le64(inode->i_ino);
- nilfs_set_de_type(de, inode);
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
de->name_len = 2;
de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
de->inode = cpu_to_le64(parent->i_ino);
memcpy(de->name, "..\0", 4);
- nilfs_set_de_type(de, inode);
+ de->file_type = fs_umode_to_ftype(inode->i_mode);
kunmap_local(kaddr);
nilfs_commit_chunk(folio, mapping, 0, chunk_size);
fail:
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8661f452dba6..be6acf6e2bfc 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/fiemap.h>
+#include <linux/random.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
@@ -28,17 +29,13 @@
* @ino: inode number
* @cno: checkpoint number
* @root: pointer on NILFS root object (mounted checkpoint)
- * @for_gc: inode for GC flag
- * @for_btnc: inode for B-tree node cache flag
- * @for_shadow: inode for shadowed page cache flag
+ * @type: inode type
*/
struct nilfs_iget_args {
u64 ino;
__u64 cno;
struct nilfs_root *root;
- bool for_gc;
- bool for_btnc;
- bool for_shadow;
+ unsigned int type;
};
static int nilfs_iget_test(struct inode *inode, void *opaque);
@@ -162,7 +159,7 @@ static int nilfs_writepages(struct address_space *mapping,
int err = 0;
if (sb_rdonly(inode->i_sb)) {
- nilfs_clear_dirty_pages(mapping, false);
+ nilfs_clear_dirty_pages(mapping);
return -EROFS;
}
@@ -186,7 +183,7 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
* have dirty pages that try to be flushed in background.
* So, here we simply discard this dirty page.
*/
- nilfs_clear_folio_dirty(folio, false);
+ nilfs_clear_folio_dirty(folio);
folio_unlock(folio);
return -EROFS;
}
@@ -315,8 +312,7 @@ static int nilfs_insert_inode_locked(struct inode *inode,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = false,
- .for_btnc = false, .for_shadow = false
+ .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL
};
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
@@ -325,7 +321,6 @@ static int nilfs_insert_inode_locked(struct inode *inode,
struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
- struct the_nilfs *nilfs = sb->s_fs_info;
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
@@ -343,25 +338,13 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
ii->i_state = BIT(NILFS_I_NEW);
+ ii->i_type = NILFS_I_TYPE_NORMAL;
ii->i_root = root;
err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
-
- if (unlikely(ino < NILFS_USER_INO)) {
- nilfs_warn(sb,
- "inode bitmap is inconsistent for reserved inodes");
- do {
- brelse(bh);
- err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
- if (unlikely(err))
- goto failed_ifile_create_inode;
- } while (ino < NILFS_USER_INO);
-
- nilfs_info(sb, "repaired inode bitmap for reserved inodes");
- }
ii->i_bh = bh;
atomic64_inc(&root->inodes_count);
@@ -385,9 +368,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
/* ii->i_dir_acl = 0; */
ii->i_dir_start_lookup = 0;
nilfs_set_inode_flags(inode);
- spin_lock(&nilfs->ns_next_gen_lock);
- inode->i_generation = nilfs->ns_next_generation++;
- spin_unlock(&nilfs->ns_next_gen_lock);
+ inode->i_generation = get_random_u32();
if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
err = -EIO;
goto failed_after_creation;
@@ -546,23 +527,10 @@ static int nilfs_iget_test(struct inode *inode, void *opaque)
return 0;
ii = NILFS_I(inode);
- if (test_bit(NILFS_I_BTNC, &ii->i_state)) {
- if (!args->for_btnc)
- return 0;
- } else if (args->for_btnc) {
+ if (ii->i_type != args->type)
return 0;
- }
- if (test_bit(NILFS_I_SHADOW, &ii->i_state)) {
- if (!args->for_shadow)
- return 0;
- } else if (args->for_shadow) {
- return 0;
- }
- if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
- return !args->for_gc;
-
- return args->for_gc && args->cno == ii->i_cno;
+ return !(args->type & NILFS_I_TYPE_GC) || args->cno == ii->i_cno;
}
static int nilfs_iget_set(struct inode *inode, void *opaque)
@@ -572,15 +540,9 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
inode->i_ino = args->ino;
NILFS_I(inode)->i_cno = args->cno;
NILFS_I(inode)->i_root = args->root;
+ NILFS_I(inode)->i_type = args->type;
if (args->root && args->ino == NILFS_ROOT_INO)
nilfs_get_root(args->root);
-
- if (args->for_gc)
- NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
- if (args->for_btnc)
- NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC);
- if (args->for_shadow)
- NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW);
return 0;
}
@@ -588,8 +550,7 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = false,
- .for_btnc = false, .for_shadow = false
+ .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL
};
return ilookup5(sb, ino, nilfs_iget_test, &args);
@@ -599,8 +560,7 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = false,
- .for_btnc = false, .for_shadow = false
+ .ino = ino, .root = root, .cno = 0, .type = NILFS_I_TYPE_NORMAL
};
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
@@ -631,8 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
__u64 cno)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = NULL, .cno = cno, .for_gc = true,
- .for_btnc = false, .for_shadow = false
+ .ino = ino, .root = NULL, .cno = cno, .type = NILFS_I_TYPE_GC
};
struct inode *inode;
int err;
@@ -677,9 +636,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode)
args.ino = inode->i_ino;
args.root = ii->i_root;
args.cno = ii->i_cno;
- args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0;
- args.for_btnc = true;
- args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
+ args.type = ii->i_type | NILFS_I_TYPE_BTNC;
btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
nilfs_iget_set, &args);
@@ -733,8 +690,8 @@ void nilfs_detach_btree_node_cache(struct inode *inode)
struct inode *nilfs_iget_for_shadow(struct inode *inode)
{
struct nilfs_iget_args args = {
- .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false,
- .for_btnc = false, .for_shadow = true
+ .ino = inode->i_ino, .root = NULL, .cno = 0,
+ .type = NILFS_I_TYPE_SHADOW
};
struct inode *s_inode;
int err;
@@ -900,7 +857,7 @@ static void nilfs_clear_inode(struct inode *inode)
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
- if (!test_bit(NILFS_I_BTNC, &ii->i_state))
+ if (!(ii->i_type & NILFS_I_TYPE_BTNC))
nilfs_detach_btree_node_cache(inode);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 8be471ce4f19..fa77f78df681 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -17,6 +17,7 @@
#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
#include <linux/fileattr.h>
+#include <linux/string.h>
#include "nilfs.h"
#include "segment.h"
#include "bmap.h"
@@ -114,7 +115,11 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
}
/**
- * nilfs_fileattr_get - ioctl to support lsattr
+ * nilfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry: the object to retrieve from
+ * @fa: fileattr pointer
+ *
+ * Return: always 0 as success.
*/
int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
@@ -126,7 +131,12 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
/**
- * nilfs_fileattr_set - ioctl to support chattr
+ * nilfs_fileattr_set - change miscellaneous file attributes
+ * @idmap: idmap of the mount
+ * @dentry: the object to change
+ * @fa: fileattr pointer
+ *
+ * Return: 0 on success, or a negative error code on failure.
*/
int nilfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa)
@@ -159,6 +169,10 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap,
/**
* nilfs_ioctl_getversion - get info about a file's version (generation number)
+ * @inode: inode object
+ * @argp: userspace memory where the generation number of @inode is stored
+ *
+ * Return: 0 on success, or %-EFAULT on error.
*/
static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
{
@@ -1266,6 +1280,91 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_get_fslabel - get the volume name of the file system
+ * @sb: super block instance
+ * @argp: pointer to userspace memory where the volume name should be stored
+ *
+ * Return: 0 on success, %-EFAULT if copying to userspace memory fails.
+ */
+static int nilfs_ioctl_get_fslabel(struct super_block *sb, void __user *argp)
+{
+ struct the_nilfs *nilfs = sb->s_fs_info;
+ char label[NILFS_MAX_VOLUME_NAME + 1];
+
+ BUILD_BUG_ON(NILFS_MAX_VOLUME_NAME >= FSLABEL_MAX);
+
+ down_read(&nilfs->ns_sem);
+ memtostr_pad(label, nilfs->ns_sbp[0]->s_volume_name);
+ up_read(&nilfs->ns_sem);
+
+ if (copy_to_user(argp, label, sizeof(label)))
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * nilfs_ioctl_set_fslabel - set the volume name of the file system
+ * @sb: super block instance
+ * @filp: file object
+ * @argp: pointer to userspace memory that contains the volume name
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EFAULT - Error copying input data.
+ * * %-EINVAL - Label length exceeds record size in superblock.
+ * * %-EIO - I/O error.
+ * * %-EPERM - Operation not permitted (insufficient permissions).
+ * * %-EROFS - Read only file system.
+ */
+static int nilfs_ioctl_set_fslabel(struct super_block *sb, struct file *filp,
+ void __user *argp)
+{
+ char label[NILFS_MAX_VOLUME_NAME + 1];
+ struct the_nilfs *nilfs = sb->s_fs_info;
+ struct nilfs_super_block **sbp;
+ size_t len;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(label, argp, NILFS_MAX_VOLUME_NAME + 1)) {
+ ret = -EFAULT;
+ goto out_drop_write;
+ }
+
+ len = strnlen(label, NILFS_MAX_VOLUME_NAME + 1);
+ if (len > NILFS_MAX_VOLUME_NAME) {
+ nilfs_err(sb, "unable to set label with more than %zu bytes",
+ NILFS_MAX_VOLUME_NAME);
+ ret = -EINVAL;
+ goto out_drop_write;
+ }
+
+ down_write(&nilfs->ns_sem);
+ sbp = nilfs_prepare_super(sb, false);
+ if (unlikely(!sbp)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ strtomem_pad(sbp[0]->s_volume_name, label, 0);
+ if (sbp[1])
+ strtomem_pad(sbp[1]->s_volume_name, label, 0);
+
+ ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+
+out_unlock:
+ up_write(&nilfs->ns_sem);
+out_drop_write:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1308,6 +1407,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_set_alloc_range(inode, argp);
case FITRIM:
return nilfs_ioctl_trim_fs(inode, argp);
+ case FS_IOC_GETFSLABEL:
+ return nilfs_ioctl_get_fslabel(inode->i_sb, argp);
+ case FS_IOC_SETFSLABEL:
+ return nilfs_ioctl_set_fslabel(inode->i_sb, filp, argp);
default:
return -ENOTTY;
}
@@ -1334,6 +1437,8 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
case FITRIM:
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 4f792a0ad0f0..ceb7dc0b5bad 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -411,7 +411,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
* have dirty folios that try to be flushed in background.
* So, here we simply discard this dirty folio.
*/
- nilfs_clear_folio_dirty(folio, false);
+ nilfs_clear_folio_dirty(folio);
folio_unlock(folio);
return -EROFS;
}
@@ -638,10 +638,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
if (mi->mi_palloc_cache)
nilfs_palloc_clear_cache(inode);
- nilfs_clear_dirty_pages(inode->i_mapping, true);
+ nilfs_clear_dirty_pages(inode->i_mapping);
nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
- nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true);
+ nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping);
nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping,
NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4017f7856440..fb1c4c5bae7c 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -22,6 +22,7 @@
/**
* struct nilfs_inode_info - nilfs inode data in memory
* @i_flags: inode flags
+ * @i_type: inode type (combination of flags that inidicate usage)
* @i_state: dynamic state flags
* @i_bmap: pointer on i_bmap_data
* @i_bmap_data: raw block mapping
@@ -37,6 +38,7 @@
*/
struct nilfs_inode_info {
__u32 i_flags;
+ unsigned int i_type;
unsigned long i_state; /* Dynamic state flags */
struct nilfs_bmap *i_bmap;
struct nilfs_bmap i_bmap_data;
@@ -90,9 +92,16 @@ enum {
NILFS_I_UPDATED, /* The file has been written back */
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
- NILFS_I_GCINODE, /* inode for GC, on memory only */
- NILFS_I_BTNC, /* inode for btree node cache */
- NILFS_I_SHADOW, /* inode for shadowed page cache */
+};
+
+/*
+ * Flags to identify the usage of on-memory inodes (i_type)
+ */
+enum {
+ NILFS_I_TYPE_NORMAL = 0,
+ NILFS_I_TYPE_GC = 0x0001, /* For data caching during GC */
+ NILFS_I_TYPE_BTNC = 0x0002, /* For btree node cache */
+ NILFS_I_TYPE_SHADOW = 0x0004, /* For shadowed page cache */
};
/*
@@ -103,6 +112,18 @@ enum {
NILFS_SB_COMMIT_ALL /* Commit both super blocks */
};
+/**
+ * define NILFS_MAX_VOLUME_NAME - maximum number of characters (bytes) in a
+ * file system volume name
+ *
+ * Defined by the size of the volume name field in the on-disk superblocks.
+ * This volume name does not include the terminating NULL byte if the string
+ * length matches the field size, so use (NILFS_MAX_VOLUME_NAME + 1) for the
+ * size of the buffer that requires a NULL byte termination.
+ */
+#define NILFS_MAX_VOLUME_NAME \
+ sizeof_field(struct nilfs_super_block, s_volume_name)
+
/*
* Macros to check inode numbers
*/
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 14e470fb8870..9c0b7cddeaae 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -262,7 +262,7 @@ repeat:
NILFS_FOLIO_BUG(folio, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
- if (unlikely(IS_ERR(dfolio))) {
+ if (IS_ERR(dfolio)) {
/* No empty page is added to the page cache */
folio_unlock(folio);
err = PTR_ERR(dfolio);
@@ -357,9 +357,8 @@ repeat:
/**
* nilfs_clear_dirty_pages - discard dirty pages in address space
* @mapping: address space with dirty pages for discarding
- * @silent: suppress [true] or print [false] warning messages
*/
-void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
+void nilfs_clear_dirty_pages(struct address_space *mapping)
{
struct folio_batch fbatch;
unsigned int i;
@@ -380,7 +379,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
* was acquired. Skip processing in that case.
*/
if (likely(folio->mapping == mapping))
- nilfs_clear_folio_dirty(folio, silent);
+ nilfs_clear_folio_dirty(folio);
folio_unlock(folio);
}
@@ -392,20 +391,13 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
/**
* nilfs_clear_folio_dirty - discard dirty folio
* @folio: dirty folio that will be discarded
- * @silent: suppress [true] or print [false] warning messages
*/
-void nilfs_clear_folio_dirty(struct folio *folio, bool silent)
+void nilfs_clear_folio_dirty(struct folio *folio)
{
- struct inode *inode = folio->mapping->host;
- struct super_block *sb = inode->i_sb;
struct buffer_head *bh, *head;
BUG_ON(!folio_test_locked(folio));
- if (!silent)
- nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
- folio_pos(folio), inode->i_ino);
-
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
@@ -419,11 +411,6 @@ void nilfs_clear_folio_dirty(struct folio *folio, bool silent)
bh = head;
do {
lock_buffer(bh);
- if (!silent)
- nilfs_warn(sb,
- "discard dirty block: blocknr=%llu, size=%zu",
- (u64)bh->b_blocknr, bh->b_size);
-
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 7e1a2c455a10..64521a03a19e 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -41,8 +41,8 @@ void nilfs_folio_bug(struct folio *);
int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
-void nilfs_clear_folio_dirty(struct folio *, bool);
-void nilfs_clear_dirty_pages(struct address_space *, bool);
+void nilfs_clear_folio_dirty(struct folio *folio);
+void nilfs_clear_dirty_pages(struct address_space *mapping);
unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
unsigned int);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ec61ce9f29a2..21d81097a89f 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -433,8 +433,17 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
* The next segment is invalidated by this recovery.
*/
err = nilfs_sufile_free(sufile, segnum[1]);
- if (unlikely(err))
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ nilfs_err(sb,
+ "checkpoint log inconsistency at block %llu (segment %llu): next segment %llu is unallocated",
+ (unsigned long long)nilfs->ns_last_pseg,
+ (unsigned long long)nilfs->ns_segnum,
+ (unsigned long long)segnum[1]);
+ err = -EINVAL;
+ }
goto failed;
+ }
for (i = 1; i < 4; i++) {
err = nilfs_segment_list_add(head, segnum[i]);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 871ec35ea8e8..587251830897 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -519,7 +519,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
ii = NILFS_I(inode);
- if (test_bit(NILFS_I_GCINODE, &ii->i_state))
+ if (ii->i_type & NILFS_I_TYPE_GC)
cno = ii->i_cno;
else if (NILFS_ROOT_METADATA_FILE(inode->i_ino))
cno = 0;
@@ -1102,12 +1102,64 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
return err;
}
+/**
+ * nilfs_free_segments - free the segments given by an array of segment numbers
+ * @nilfs: nilfs object
+ * @segnumv: array of segment numbers to be freed
+ * @nsegs: number of segments to be freed in @segnumv
+ *
+ * nilfs_free_segments() wraps nilfs_sufile_freev() and
+ * nilfs_sufile_cancel_freev(), and edits the segment usage metadata file
+ * (sufile) to free all segments given by @segnumv and @nsegs at once. If
+ * it fails midway, it cancels the changes so that none of the segments are
+ * freed. If @nsegs is 0, this function does nothing.
+ *
+ * The freeing of segments is not finalized until the writing of a log with
+ * a super root block containing this sufile change is complete, and it can
+ * be canceled with nilfs_sufile_cancel_freev() until then.
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid segment number.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOMEM - Insufficient memory available.
+ */
+static int nilfs_free_segments(struct the_nilfs *nilfs, __u64 *segnumv,
+ size_t nsegs)
+{
+ size_t ndone;
+ int ret;
+
+ if (!nsegs)
+ return 0;
+
+ ret = nilfs_sufile_freev(nilfs->ns_sufile, segnumv, nsegs, &ndone);
+ if (unlikely(ret)) {
+ nilfs_sufile_cancel_freev(nilfs->ns_sufile, segnumv, ndone,
+ NULL);
+ /*
+ * If a segment usage of the segments to be freed is in a
+ * hole block, nilfs_sufile_freev() will return -ENOENT.
+ * In this case, -EINVAL should be returned to the caller
+ * since there is something wrong with the given segment
+ * number array. This error can only occur during GC, so
+ * there is no need to worry about it propagating to other
+ * callers (such as fsync).
+ */
+ if (ret == -ENOENT) {
+ nilfs_err(nilfs->ns_sb,
+ "The segment usage entry %llu to be freed is invalid (in a hole)",
+ (unsigned long long)segnumv[ndone]);
+ ret = -EINVAL;
+ }
+ }
+ return ret;
+}
+
static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
{
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
struct list_head *head;
struct nilfs_inode_info *ii;
- size_t ndone;
int err = 0;
switch (nilfs_sc_cstage_get(sci)) {
@@ -1201,14 +1253,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
nilfs_sc_cstage_inc(sci);
fallthrough;
case NILFS_ST_SUFILE:
- err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
- sci->sc_nfreesegs, &ndone);
- if (unlikely(err)) {
- nilfs_sufile_cancel_freev(nilfs->ns_sufile,
- sci->sc_freesegs, ndone,
- NULL);
+ err = nilfs_free_segments(nilfs, sci->sc_freesegs,
+ sci->sc_nfreesegs);
+ if (unlikely(err))
break;
- }
sci->sc_stage.flags |= NILFS_CF_SUFREED;
err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
@@ -2456,7 +2504,7 @@ static void nilfs_construction_timeout(struct timer_list *t)
{
struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer);
- wake_up_process(sci->sc_timer_task);
+ wake_up_process(sci->sc_task);
}
static void
@@ -2582,123 +2630,85 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
}
/**
- * nilfs_segctor_thread - main loop of the segment constructor thread.
+ * nilfs_log_write_required - determine whether log writing is required
+ * @sci: nilfs_sc_info struct
+ * @modep: location for storing log writing mode
+ *
+ * Return: true if log writing is required, false otherwise. If log writing
+ * is required, the mode is stored in the location pointed to by @modep.
+ */
+static bool nilfs_log_write_required(struct nilfs_sc_info *sci, int *modep)
+{
+ bool timedout, ret = true;
+
+ spin_lock(&sci->sc_state_lock);
+ timedout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+ time_after_eq(jiffies, sci->sc_timer.expires));
+ if (timedout || sci->sc_seq_request != sci->sc_seq_done)
+ *modep = SC_LSEG_SR;
+ else if (sci->sc_flush_request)
+ *modep = nilfs_segctor_flush_mode(sci);
+ else
+ ret = false;
+
+ spin_unlock(&sci->sc_state_lock);
+ return ret;
+}
+
+/**
+ * nilfs_segctor_thread - main loop of the log writer thread
* @arg: pointer to a struct nilfs_sc_info.
*
- * nilfs_segctor_thread() initializes a timer and serves as a daemon
- * to execute segment constructions.
+ * nilfs_segctor_thread() is the main loop function of the log writer kernel
+ * thread, which determines whether log writing is necessary, and if so,
+ * performs the log write in the background, or waits if not. It is also
+ * used to decide the background writeback of the superblock.
+ *
+ * Return: Always 0.
*/
static int nilfs_segctor_thread(void *arg)
{
struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- int timeout = 0;
-
- sci->sc_timer_task = current;
- timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);
- /* start sync. */
- sci->sc_task = current;
- wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
nilfs_info(sci->sc_super,
"segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
set_freezable();
- spin_lock(&sci->sc_state_lock);
- loop:
- for (;;) {
- int mode;
-
- if (sci->sc_state & NILFS_SEGCTOR_QUIT)
- goto end_thread;
-
- if (timeout || sci->sc_seq_request != sci->sc_seq_done)
- mode = SC_LSEG_SR;
- else if (sci->sc_flush_request)
- mode = nilfs_segctor_flush_mode(sci);
- else
- break;
-
- spin_unlock(&sci->sc_state_lock);
- nilfs_segctor_thread_construct(sci, mode);
- spin_lock(&sci->sc_state_lock);
- timeout = 0;
- }
-
- if (freezing(current)) {
- spin_unlock(&sci->sc_state_lock);
- try_to_freeze();
- spin_lock(&sci->sc_state_lock);
- } else {
+ while (!kthread_should_stop()) {
DEFINE_WAIT(wait);
- int should_sleep = 1;
+ bool should_write;
+ int mode;
+
+ if (freezing(current)) {
+ try_to_freeze();
+ continue;
+ }
prepare_to_wait(&sci->sc_wait_daemon, &wait,
TASK_INTERRUPTIBLE);
-
- if (sci->sc_seq_request != sci->sc_seq_done)
- should_sleep = 0;
- else if (sci->sc_flush_request)
- should_sleep = 0;
- else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
- should_sleep = time_before(jiffies,
- sci->sc_timer.expires);
-
- if (should_sleep) {
- spin_unlock(&sci->sc_state_lock);
+ should_write = nilfs_log_write_required(sci, &mode);
+ if (!should_write)
schedule();
- spin_lock(&sci->sc_state_lock);
- }
finish_wait(&sci->sc_wait_daemon, &wait);
- timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
- time_after_eq(jiffies, sci->sc_timer.expires));
if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
set_nilfs_discontinued(nilfs);
+
+ if (should_write)
+ nilfs_segctor_thread_construct(sci, mode);
}
- goto loop;
- end_thread:
/* end sync. */
+ spin_lock(&sci->sc_state_lock);
sci->sc_task = NULL;
timer_shutdown_sync(&sci->sc_timer);
- wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
spin_unlock(&sci->sc_state_lock);
return 0;
}
-static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
-{
- struct task_struct *t;
-
- t = kthread_run(nilfs_segctor_thread, sci, "segctord");
- if (IS_ERR(t)) {
- int err = PTR_ERR(t);
-
- nilfs_err(sci->sc_super, "error %d creating segctord thread",
- err);
- return err;
- }
- wait_event(sci->sc_wait_task, sci->sc_task != NULL);
- return 0;
-}
-
-static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
- __acquires(&sci->sc_state_lock)
- __releases(&sci->sc_state_lock)
-{
- sci->sc_state |= NILFS_SEGCTOR_QUIT;
-
- while (sci->sc_task) {
- wake_up(&sci->sc_wait_daemon);
- spin_unlock(&sci->sc_state_lock);
- wait_event(sci->sc_wait_task, sci->sc_task == NULL);
- spin_lock(&sci->sc_state_lock);
- }
-}
-
/*
* Setup & clean-up functions
*/
@@ -2719,7 +2729,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
init_waitqueue_head(&sci->sc_wait_request);
init_waitqueue_head(&sci->sc_wait_daemon);
- init_waitqueue_head(&sci->sc_wait_task);
spin_lock_init(&sci->sc_state_lock);
INIT_LIST_HEAD(&sci->sc_dirty_files);
INIT_LIST_HEAD(&sci->sc_segbufs);
@@ -2774,8 +2783,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
up_write(&nilfs->ns_segctor_sem);
+ if (sci->sc_task) {
+ wake_up(&sci->sc_wait_daemon);
+ kthread_stop(sci->sc_task);
+ }
+
spin_lock(&sci->sc_state_lock);
- nilfs_segctor_kill_thread(sci);
flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
@@ -2823,14 +2836,15 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
* This allocates a log writer object, initializes it, and starts the
* log writer.
*
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error code is returned.
- *
- * %-ENOMEM - Insufficient memory available.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINTR - Log writer thread creation failed due to interruption.
+ * * %-ENOMEM - Insufficient memory available.
*/
int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
{
struct the_nilfs *nilfs = sb->s_fs_info;
+ struct nilfs_sc_info *sci;
+ struct task_struct *t;
int err;
if (nilfs->ns_writer) {
@@ -2843,15 +2857,23 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
return 0;
}
- nilfs->ns_writer = nilfs_segctor_new(sb, root);
- if (!nilfs->ns_writer)
+ sci = nilfs_segctor_new(sb, root);
+ if (unlikely(!sci))
return -ENOMEM;
- err = nilfs_segctor_start_thread(nilfs->ns_writer);
- if (unlikely(err))
+ nilfs->ns_writer = sci;
+ t = kthread_create(nilfs_segctor_thread, sci, "segctord");
+ if (IS_ERR(t)) {
+ err = PTR_ERR(t);
+ nilfs_err(sb, "error %d creating segctord thread", err);
nilfs_detach_log_writer(sb);
+ return err;
+ }
+ sci->sc_task = t;
+ timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0);
- return err;
+ wake_up_process(sci->sc_task);
+ return 0;
}
/**
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 1060f72ebf5a..f723f47ddc4e 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -22,10 +22,10 @@ struct nilfs_root;
* struct nilfs_recovery_info - Recovery information
* @ri_need_recovery: Recovery status
* @ri_super_root: Block number of the last super root
- * @ri_ri_cno: Number of the last checkpoint
+ * @ri_cno: Number of the last checkpoint
* @ri_lsegs_start: Region for roll-forwarding (start block number)
* @ri_lsegs_end: Region for roll-forwarding (end block number)
- * @ri_lseg_start_seq: Sequence value of the segment at ri_lsegs_start
+ * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
* @ri_used_segments: List of segments to be mark active
* @ri_pseg_start: Block number of the last partial segment
* @ri_seq: Sequence number on the last partial segment
@@ -105,9 +105,8 @@ struct nilfs_segsum_pointer {
* @sc_flush_request: inode bitmap of metadata files to be flushed
* @sc_wait_request: Client request queue
* @sc_wait_daemon: Daemon wait queue
- * @sc_wait_task: Start/end wait queue to control segctord task
* @sc_seq_request: Request counter
- * @sc_seq_accept: Accepted request count
+ * @sc_seq_accepted: Accepted request count
* @sc_seq_done: Completion counter
* @sc_sync: Request of explicit sync operation
* @sc_interval: Timeout value of background construction
@@ -158,7 +157,6 @@ struct nilfs_sc_info {
wait_queue_head_t sc_wait_request;
wait_queue_head_t sc_wait_daemon;
- wait_queue_head_t sc_wait_task;
__u32 sc_seq_request;
__u32 sc_seq_accepted;
@@ -171,7 +169,6 @@ struct nilfs_sc_info {
unsigned long sc_watermark;
struct timer_list sc_timer;
- struct task_struct *sc_timer_task;
struct task_struct *sc_task;
};
@@ -192,7 +189,6 @@ enum {
};
/* sc_state */
-#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
/*
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 6748218be7c5..eea5a6a12f7b 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -79,10 +79,17 @@ nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
NILFS_MDT(sufile)->mi_entry_size;
}
-static inline int nilfs_sufile_get_header_block(struct inode *sufile,
- struct buffer_head **bhp)
+static int nilfs_sufile_get_header_block(struct inode *sufile,
+ struct buffer_head **bhp)
{
- return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
+ int err = nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
+
+ if (unlikely(err == -ENOENT)) {
+ nilfs_error(sufile->i_sb,
+ "missing header block in segment usage metadata");
+ err = -EIO;
+ }
+ return err;
}
static inline int
@@ -506,8 +513,15 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
- if (ret)
+ if (unlikely(ret)) {
+ if (ret == -ENOENT) {
+ nilfs_error(sufile->i_sb,
+ "segment usage for segment %llu is unreadable due to a hole block",
+ (unsigned long long)segnum);
+ ret = -EIO;
+ }
goto out_sem;
+ }
kaddr = kmap_local_page(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
@@ -840,21 +854,17 @@ out:
}
/**
- * nilfs_sufile_get_suinfo -
+ * nilfs_sufile_get_suinfo - get segment usage information
* @sufile: inode of segment usage file
* @segnum: segment number to start looking
- * @buf: array of suinfo
- * @sisz: byte size of suinfo
- * @nsi: size of suinfo array
- *
- * Description:
+ * @buf: array of suinfo
+ * @sisz: byte size of suinfo
+ * @nsi: size of suinfo array
*
- * Return Value: On success, 0 is returned and .... On error, one of the
- * following negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
+ * Return: Count of segment usage info items stored in the output buffer on
+ * success, or the following negative error code on failure.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOMEM - Insufficient memory available.
*/
ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
unsigned int sisz, size_t nsi)
@@ -1241,9 +1251,15 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
if (err)
goto failed;
- err = nilfs_sufile_get_header_block(sufile, &header_bh);
- if (err)
+ err = nilfs_mdt_get_block(sufile, 0, 0, NULL, &header_bh);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ nilfs_err(sb,
+ "missing header block in segment usage metadata");
+ err = -EINVAL;
+ }
goto failed;
+ }
sui = NILFS_SUI(sufile);
kaddr = kmap_local_page(header_bh->b_page);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e835e1f5a712..eca79cca3803 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -105,6 +105,10 @@ static void nilfs_set_error(struct super_block *sb)
/**
* __nilfs_error() - report failure condition on a filesystem
+ * @sb: super block instance
+ * @function: name of calling function
+ * @fmt: format string for message to be output
+ * @...: optional arguments to @fmt
*
* __nilfs_error() sets an ERROR_FS flag on the superblock as well as
* reporting an error message. This function should be called when
@@ -156,6 +160,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
return NULL;
ii->i_bh = NULL;
ii->i_state = 0;
+ ii->i_type = 0;
ii->i_cno = 0;
ii->i_assoc_inode = NULL;
ii->i_bmap = &ii->i_bmap_data;
@@ -1063,6 +1068,10 @@ nilfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
goto failed_nilfs;
+ super_set_uuid(sb, nilfs->ns_sbp[0]->s_uuid,
+ sizeof(nilfs->ns_sbp[0]->s_uuid));
+ super_set_sysfs_name_bdev(sb);
+
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index e44dde57ab65..ac03fd3c330c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -12,7 +12,6 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
-#include <linux/random.h>
#include <linux/log2.h>
#include <linux/crc32.h>
#include "nilfs.h"
@@ -69,7 +68,6 @@ struct the_nilfs *alloc_nilfs(struct super_block *sb)
INIT_LIST_HEAD(&nilfs->ns_dirty_files);
INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
spin_lock_init(&nilfs->ns_inode_lock);
- spin_lock_init(&nilfs->ns_next_gen_lock);
spin_lock_init(&nilfs->ns_last_segment_lock);
nilfs->ns_cptree = RB_ROOT;
spin_lock_init(&nilfs->ns_cptree_lock);
@@ -754,9 +752,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
nilfs->ns_blocksize = blocksize;
- get_random_bytes(&nilfs->ns_next_generation,
- sizeof(nilfs->ns_next_generation));
-
err = nilfs_store_disk_layout(nilfs, sbp);
if (err)
goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1e829ed7b0ef..4776a70f01ae 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -71,8 +71,6 @@ enum {
* @ns_dirty_files: list of dirty files
* @ns_inode_lock: lock protecting @ns_dirty_files
* @ns_gc_inodes: dummy inodes to keep live blocks
- * @ns_next_generation: next generation number for inodes
- * @ns_next_gen_lock: lock protecting @ns_next_generation
* @ns_mount_opt: mount options
* @ns_resuid: uid for reserved blocks
* @ns_resgid: gid for reserved blocks
@@ -161,10 +159,6 @@ struct the_nilfs {
/* GC inode list */
struct list_head ns_gc_inodes;
- /* Inode allocator */
- u32 ns_next_generation;
- spinlock_t ns_next_gen_lock;
-
/* Mount options */
unsigned long ns_mount_opt;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9ec313e9f6e1..13454e5fd3fb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1006,17 +1006,17 @@ static int fanotify_find_path(int dfd, const char __user *filename,
struct fd f = fdget(dfd);
ret = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
ret = -ENOTDIR;
if ((flags & FAN_MARK_ONLYDIR) &&
- !(S_ISDIR(file_inode(f.file)->i_mode))) {
+ !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
fdput(f);
goto out;
}
- *path = f.file->f_path;
+ *path = fd_file(f)->f_path;
path_get(path);
fdput(f);
} else {
@@ -1753,14 +1753,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
f = fdget(fanotify_fd);
- if (unlikely(!f.file))
+ if (unlikely(!fd_file(f)))
return -EBADF;
/* verify that this is indeed an fanotify instance */
ret = -EINVAL;
- if (unlikely(f.file->f_op != &fanotify_fops))
+ if (unlikely(fd_file(f)->f_op != &fanotify_fops))
goto fput_and_out;
- group = f.file->private_data;
+ group = fd_file(f)->private_data;
/*
* An unprivileged user is not allowed to setup mount nor filesystem
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4ffc30606e0b..c7e451d5bd51 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -753,7 +753,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
return -EINVAL;
f = fdget(fd);
- if (unlikely(!f.file))
+ if (unlikely(!fd_file(f)))
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
@@ -763,7 +763,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
}
/* verify that this is indeed an inotify instance */
- if (unlikely(f.file->f_op != &inotify_fops)) {
+ if (unlikely(fd_file(f)->f_op != &inotify_fops)) {
ret = -EINVAL;
goto fput_and_out;
}
@@ -780,7 +780,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
/* inode held in place by reference to path; group by fget on fd */
inode = path.dentry->d_inode;
- group = f.file->private_data;
+ group = fd_file(f)->private_data;
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
@@ -798,14 +798,14 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
int ret = -EINVAL;
f = fdget(fd);
- if (unlikely(!f.file))
+ if (unlikely(!fd_file(f)))
return -EBADF;
/* verify that this is indeed an inotify instance */
- if (unlikely(f.file->f_op != &inotify_fops))
+ if (unlikely(fd_file(f)->f_op != &inotify_fops))
goto out;
- group = f.file->private_data;
+ group = fd_file(f)->private_data;
i_mark = inotify_idr_find(group, wd);
if (unlikely(!i_mark))
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 67ee176b8824..c675fc40ce2d 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -22,7 +22,6 @@ static struct vfsmount *nsfs_mnt;
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = {
- .llseek = no_llseek,
.unlocked_ioctl = ns_ioctl,
.compat_ioctl = compat_ptr_ioctl,
};
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d6c985cc6353..db72b3e924b3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -156,9 +156,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
&ext_flags);
if (err) {
- mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
- "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
- (unsigned long long)p_blkno);
+ mlog(ML_ERROR, "get_blocks() failed, inode: 0x%p, "
+ "block: %llu\n", inode, (unsigned long long)iblock);
goto bail;
}
@@ -1187,7 +1186,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
/* This is the direct io target page. */
if (wc->w_pages[i] == NULL) {
- p_blkno++;
+ p_blkno += (1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits));
continue;
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index cdb9b9bdea1f..8f714406528d 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -235,7 +235,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
- ocfs2_metadata_cache_io_unlock(ci);
status = -ENOMEM;
mlog_errno(status);
/* Don't forget to put previous bh! */
@@ -389,7 +388,8 @@ read_failure:
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- ocfs2_set_buffer_uptodate(ci, bh);
+ if (bh)
+ ocfs2_set_buffer_uptodate(ci, bh);
}
ocfs2_metadata_cache_io_unlock(ci);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1bde1281d514..4b9f45d7049e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1785,17 +1785,17 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
goto out;
f = fdget(fd);
- if (f.file == NULL)
+ if (fd_file(f) == NULL)
goto out;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
goto out2;
- if (!S_ISBLK(f.file->f_mapping->host->i_mode))
+ if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
goto out2;
- reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev,
+ reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(reg->hr_bdev_file)) {
ret = PTR_ERR(reg->hr_bdev_file);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ccef3f42b333..213206ebdd58 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3512,16 +3512,6 @@ static int dx_leaf_sort_cmp(const void *a, const void *b)
return 0;
}
-static void dx_leaf_sort_swap(void *a, void *b, int size)
-{
- struct ocfs2_dx_entry *entry1 = a;
- struct ocfs2_dx_entry *entry2 = b;
-
- BUG_ON(size != sizeof(*entry1));
-
- swap(*entry1, *entry2);
-}
-
static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
{
struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
@@ -3782,7 +3772,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
*/
sort(dx_leaf->dl_list.de_entries, num_used,
sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
- dx_leaf_sort_swap);
+ NULL);
ocfs2_journal_dirty(handle, dx_leaf_bh);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index da78a04d6f0b..60df52e4c1f8 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3151,11 +3151,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
#ifdef CONFIG_OCFS2_FS_STATS
if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
now = ktime_to_us(ktime_get_real());
- if (lockres->l_lock_prmode.ls_last >
- lockres->l_lock_exmode.ls_last)
- last = lockres->l_lock_prmode.ls_last;
- else
- last = lockres->l_lock_exmode.ls_last;
+ last = max(lockres->l_lock_prmode.ls_last,
+ lockres->l_lock_exmode.ls_last);
/*
* Use d_filter_secs field to filter lock resources dump,
* the default d_filter_secs(0) value filters nothing,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 70a768b623cf..f7672472fa82 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -973,7 +973,13 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
}
while (done < nr) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
+ rc = -EAGAIN;
+ mlog(ML_ERROR,
+ "Inode #%llu ip_alloc_sem is temporarily unavailable\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ break;
+ }
rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
&p_block, &p_count, NULL);
up_read(&OCFS2_I(inode)->ip_alloc_sem);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 530fba34f6d3..1bf188b6866a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1055,7 +1055,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (!igrab(inode))
BUG();
- num_running_trans = atomic_read(&(osb->journal->j_num_trans));
+ num_running_trans = atomic_read(&(journal->j_num_trans));
trace_ocfs2_journal_shutdown(num_running_trans);
/* Do a commit_cache here. It will flush our journal, *and*
@@ -1074,9 +1074,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
osb->commit_task = NULL;
}
- BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
+ BUG_ON(atomic_read(&(journal->j_num_trans)) != 0);
- if (ocfs2_mount_local(osb)) {
+ if (ocfs2_mount_local(osb) &&
+ (journal->j_journal->j_flags & JBD2_LOADED)) {
jbd2_journal_lock_updates(journal->j_journal);
status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5df34561c551..8ac42ea81a17 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1002,6 +1002,25 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
start = bit_off + 1;
}
+ /* clear the contiguous bits until the end boundary */
+ if (count) {
+ blkno = la_start_blk +
+ ocfs2_clusters_to_blocks(osb->sb,
+ start - count);
+
+ trace_ocfs2_sync_local_to_main_free(
+ count, start - count,
+ (unsigned long long)la_start_blk,
+ (unsigned long long)blkno);
+
+ status = ocfs2_release_clusters(handle,
+ main_bm_inode,
+ main_bm_bh, blkno,
+ count);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
bail:
if (status)
mlog_errno(status);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 0575c2d060eb..2b0daced98eb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -371,12 +371,16 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
- if (status < 0)
+ if (status < 0) {
+ mlog_errno(status);
goto out_unlock;
+ }
status = ocfs2_qinfo_lock(oinfo, 0);
- if (status < 0)
+ if (status < 0) {
+ mlog_errno(status);
goto out_unlock;
+ }
status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct ocfs2_global_disk_dqinfo),
OCFS2_GLOBAL_INFO_OFF);
@@ -404,12 +408,11 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
schedule_delayed_work(&oinfo->dqi_sync_work,
msecs_to_jiffies(oinfo->dqi_syncms));
-out_err:
- return status;
+ return 0;
out_unlock:
ocfs2_unlock_global_qf(oinfo, 0);
- mlog_errno(status);
- goto out_err;
+out_err:
+ return status;
}
/* Write information to global quota file. Expects exclusive lock on quota
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8ce462c64c51..73d3367c533b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -692,7 +692,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
int status;
struct buffer_head *bh = NULL;
struct ocfs2_quota_recovery *rec;
- int locked = 0;
+ int locked = 0, global_read = 0;
info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
@@ -700,6 +700,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
if (!oinfo) {
mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
" info.");
+ status = -ENOMEM;
goto out_err;
}
info->dqi_priv = oinfo;
@@ -712,6 +713,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
status = ocfs2_global_read_info(sb, type);
if (status < 0)
goto out_err;
+ global_read = 1;
status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
if (status < 0) {
@@ -782,10 +784,12 @@ out_err:
if (locked)
ocfs2_inode_unlock(lqinode, 1);
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ if (global_read)
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
kfree(oinfo);
}
brelse(bh);
- return -1;
+ return status;
}
/* Write local info to quota file */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 1f303b1adf1a..004393b13c0a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -25,6 +25,7 @@
#include "namei.h"
#include "ocfs2_trace.h"
#include "file.h"
+#include "symlink.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -1392,13 +1393,6 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
return 0;
}
-static void swap_refcount_rec(void *a, void *b, int size)
-{
- struct ocfs2_refcount_rec *l = a, *r = b;
-
- swap(*l, *r);
-}
-
/*
* The refcount cpos are ordered by their 64bit cpos,
* But we will use the low 32 bit to be the e_cpos in the b-tree.
@@ -1474,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
*/
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_low_cpos, NULL);
ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
if (ret) {
@@ -1499,11 +1493,11 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_cpos, NULL);
sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
- cmp_refcount_rec_by_cpos, swap_refcount_rec);
+ cmp_refcount_rec_by_cpos, NULL);
*split_cpos = cpos;
return 0;
@@ -4155,8 +4149,9 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
int ret;
struct inode *inode = d_inode(old_dentry);
struct buffer_head *new_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
- if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+ if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
@@ -4182,6 +4177,26 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out_unlock;
}
+ if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) &&
+ (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ /*
+ * Adjust extent record count to reserve space for extended attribute.
+ * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+ */
+ struct ocfs2_inode_info *new_oi = OCFS2_I(new_inode);
+
+ if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+ !(ocfs2_inode_is_fast_symlink(new_inode))) {
+ struct ocfs2_dinode *new_di = (struct ocfs2_dinode *)new_bh->b_data;
+ struct ocfs2_dinode *old_di = (struct ocfs2_dinode *)old_bh->b_data;
+ struct ocfs2_extent_list *el = &new_di->id2.i_list;
+ int inline_size = le16_to_cpu(old_di->i_xattr_inline_size);
+
+ le16_add_cpu(&el->l_count, -(inline_size /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ }
+
ret = ocfs2_create_reflink_node(inode, old_bh,
new_inode, new_bh, preserve);
if (ret) {
@@ -4189,7 +4204,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto inode_unlock;
}
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+ if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
ret = ocfs2_reflink_xattrs(inode, old_bh,
new_inode, new_bh,
preserve);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index afee70125ae3..3d404624bb96 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1571,15 +1571,13 @@ static int __init ocfs2_init(void)
ocfs2_set_locking_protocol();
- status = register_quota_format(&ocfs2_quota_format);
- if (status < 0)
- goto out3;
+ register_quota_format(&ocfs2_quota_format);
+
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out3:
debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
@@ -2357,8 +2355,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
(unsigned long long)bh->b_blocknr);
} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
- mlog(ML_ERROR, "bad cluster size found: %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
+ mlog(ML_ERROR, "bad cluster size bit found: %u\n",
+ le32_to_cpu(di->id2.i_super.s_clustersize_bits));
} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
mlog(ML_ERROR, "bad root_blkno: 0\n");
} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 35c0cc2a51af..dd0a05365e79 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4167,15 +4167,6 @@ static int cmp_xe(const void *a, const void *b)
return 0;
}
-static void swap_xe(void *a, void *b, int size)
-{
- struct ocfs2_xattr_entry *l = a, *r = b, tmp;
-
- tmp = *l;
- memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
- memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
-}
-
/*
* When the ocfs2_xattr_block is filled up, new bucket will be created
* and all the xattr entries will be moved to the new bucket.
@@ -4241,7 +4232,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
- cmp_xe, swap_xe);
+ cmp_xe, NULL);
}
/*
@@ -4436,7 +4427,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
*/
sort(entries, le16_to_cpu(xh->xh_count),
sizeof(struct ocfs2_xattr_entry),
- cmp_xe_offset, swap_xe);
+ cmp_xe_offset, NULL);
/* Move all name/values to the end of the bucket. */
xe = xh->xh_entries;
@@ -4478,7 +4469,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
/* sort the entries by their name_hash. */
sort(entries, le16_to_cpu(xh->xh_count),
sizeof(struct ocfs2_xattr_entry),
- cmp_xe, swap_xe);
+ cmp_xe, NULL);
buf = bucket_buf;
for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
@@ -6520,16 +6511,7 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
}
new_oi = OCFS2_I(args->new_inode);
- /*
- * Adjust extent record count to reserve space for extended attribute.
- * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
- */
- if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
- !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
- struct ocfs2_extent_list *el = &new_di->id2.i_list;
- le16_add_cpu(&el->l_count, -(inline_size /
- sizeof(struct ocfs2_extent_rec)));
- }
+
spin_lock(&new_oi->ip_lock);
new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
diff --git a/fs/open.c b/fs/open.c
index daf1b55ca818..acaeb3e25c88 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -193,10 +193,10 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (length < 0)
return -EINVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = do_ftruncate(f.file, length, small);
+ error = do_ftruncate(fd_file(f), length, small);
fdput(f);
return error;
@@ -352,8 +352,8 @@ int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fallocate(f.file, mode, offset, len);
+ if (fd_file(f)) {
+ error = vfs_fallocate(fd_file(f), mode, offset, len);
fdput(f);
}
return error;
@@ -584,16 +584,16 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
int error;
error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
error = -ENOTDIR;
- if (!d_can_lookup(f.file->f_path.dentry))
+ if (!d_can_lookup(fd_file(f)->f_path.dentry))
goto out_putf;
- error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
+ error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
- set_fs_pwd(current->fs, &f.file->f_path);
+ set_fs_pwd(current->fs, &fd_file(f)->f_path);
out_putf:
fdput(f);
out:
@@ -674,8 +674,8 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
struct fd f = fdget(fd);
int err = -EBADF;
- if (f.file) {
- err = vfs_fchmod(f.file, mode);
+ if (fd_file(f)) {
+ err = vfs_fchmod(fd_file(f), mode);
fdput(f);
}
return err;
@@ -868,8 +868,8 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fchown(f.file, user, group);
+ if (fd_file(f)) {
+ error = vfs_fchown(fd_file(f), user, group);
fdput(f);
}
return error;
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index be4ba03a01a0..04e15dfa504a 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -904,7 +904,7 @@ static void orangefs_obj_release(struct kobject *kobj)
orangefs_obj = NULL;
}
-static struct kobj_type orangefs_ktype = {
+static const struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = orangefs_default_groups,
.release = orangefs_obj_release,
@@ -951,7 +951,7 @@ static void acache_orangefs_obj_release(struct kobject *kobj)
acache_orangefs_obj = NULL;
}
-static struct kobj_type acache_orangefs_ktype = {
+static const struct kobj_type acache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = acache_orangefs_default_groups,
.release = acache_orangefs_obj_release,
@@ -998,7 +998,7 @@ static void capcache_orangefs_obj_release(struct kobject *kobj)
capcache_orangefs_obj = NULL;
}
-static struct kobj_type capcache_orangefs_ktype = {
+static const struct kobj_type capcache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = capcache_orangefs_default_groups,
.release = capcache_orangefs_obj_release,
@@ -1045,7 +1045,7 @@ static void ccache_orangefs_obj_release(struct kobject *kobj)
ccache_orangefs_obj = NULL;
}
-static struct kobj_type ccache_orangefs_ktype = {
+static const struct kobj_type ccache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ccache_orangefs_default_groups,
.release = ccache_orangefs_obj_release,
@@ -1092,7 +1092,7 @@ static void ncache_orangefs_obj_release(struct kobject *kobj)
ncache_orangefs_obj = NULL;
}
-static struct kobj_type ncache_orangefs_ktype = {
+static const struct kobj_type ncache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ncache_orangefs_default_groups,
.release = ncache_orangefs_obj_release,
@@ -1132,7 +1132,7 @@ static void pc_orangefs_obj_release(struct kobject *kobj)
pc_orangefs_obj = NULL;
}
-static struct kobj_type pc_orangefs_ktype = {
+static const struct kobj_type pc_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = pc_orangefs_default_groups,
.release = pc_orangefs_obj_release,
@@ -1165,7 +1165,7 @@ static void stats_orangefs_obj_release(struct kobject *kobj)
stats_orangefs_obj = NULL;
}
-static struct kobj_type stats_orangefs_ktype = {
+static const struct kobj_type stats_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = stats_orangefs_default_groups,
.release = stats_orangefs_obj_release,
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 1a411cae57ed..4504493b20be 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -93,11 +93,11 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
bool allow_meta)
{
struct dentry *dentry = file_dentry(file);
+ struct file *realfile = file->private_data;
struct path realpath;
int err;
- real->flags = 0;
- real->file = file->private_data;
+ real->word = (unsigned long)realfile;
if (allow_meta) {
ovl_path_real(dentry, &realpath);
@@ -113,16 +113,17 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
return -EIO;
/* Has it been copied up since we'd opened it? */
- if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
- real->flags = FDPUT_FPUT;
- real->file = ovl_open_realfile(file, &realpath);
-
- return PTR_ERR_OR_ZERO(real->file);
+ if (unlikely(file_inode(realfile) != d_inode(realpath.dentry))) {
+ struct file *f = ovl_open_realfile(file, &realpath);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ real->word = (unsigned long)f | FDPUT_FPUT;
+ return 0;
}
/* Did the flags change since open? */
- if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
- return ovl_change_flags(real->file, file->f_flags);
+ if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS))
+ return ovl_change_flags(realfile, file->f_flags);
return 0;
}
@@ -130,10 +131,11 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
static int ovl_real_fdget(const struct file *file, struct fd *real)
{
if (d_is_dir(file_dentry(file))) {
- real->flags = 0;
- real->file = ovl_dir_real_file(file, false);
-
- return PTR_ERR_OR_ZERO(real->file);
+ struct file *f = ovl_dir_real_file(file, false);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+ real->word = (unsigned long)f;
+ return 0;
}
return ovl_real_fdget_meta(file, real, false);
@@ -209,13 +211,13 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
* files, so we use the real file to perform seeks.
*/
ovl_inode_lock(inode);
- real.file->f_pos = file->f_pos;
+ fd_file(real)->f_pos = file->f_pos;
old_cred = ovl_override_creds(inode->i_sb);
- ret = vfs_llseek(real.file, offset, whence);
+ ret = vfs_llseek(fd_file(real), offset, whence);
revert_creds(old_cred);
- file->f_pos = real.file->f_pos;
+ file->f_pos = fd_file(real)->f_pos;
ovl_inode_unlock(inode);
fdput(real);
@@ -275,7 +277,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
- ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+ ret = backing_file_read_iter(fd_file(real), iter, iocb, iocb->ki_flags,
&ctx);
fdput(real);
@@ -314,7 +316,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
- ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
+ ret = backing_file_write_iter(fd_file(real), iter, iocb, ifl, &ctx);
fdput(real);
out_unlock:
@@ -339,7 +341,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
if (ret)
return ret;
- ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
+ ret = backing_file_splice_read(fd_file(real), ppos, pipe, len, flags, &ctx);
fdput(real);
return ret;
@@ -348,7 +350,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
/*
* Calling iter_file_splice_write() directly from overlay's f_op may deadlock
* due to lock order inversion between pipe->mutex in iter_file_splice_write()
- * and file_start_write(real.file) in ovl_write_iter().
+ * and file_start_write(fd_file(real)) in ovl_write_iter().
*
* So do everything ovl_write_iter() does and call iter_file_splice_write() on
* the real file.
@@ -373,7 +375,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (ret)
goto out_unlock;
- ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
+ ret = backing_file_splice_write(pipe, fd_file(real), ppos, len, flags, &ctx);
fdput(real);
out_unlock:
@@ -397,9 +399,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
/* Don't sync lower file for fear of receiving EROFS error */
- if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
+ if (file_inode(fd_file(real)) == ovl_inode_upper(file_inode(file))) {
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fsync_range(real.file, start, end, datasync);
+ ret = vfs_fsync_range(fd_file(real), start, end, datasync);
revert_creds(old_cred);
}
@@ -439,7 +441,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fallocate(real.file, mode, offset, len);
+ ret = vfs_fallocate(fd_file(real), mode, offset, len);
revert_creds(old_cred);
/* Update size */
@@ -464,7 +466,7 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
return ret;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fadvise(real.file, offset, len, advice);
+ ret = vfs_fadvise(fd_file(real), offset, len, advice);
revert_creds(old_cred);
fdput(real);
@@ -509,18 +511,18 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
switch (op) {
case OVL_COPY:
- ret = vfs_copy_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len, flags);
+ ret = vfs_copy_file_range(fd_file(real_in), pos_in,
+ fd_file(real_out), pos_out, len, flags);
break;
case OVL_CLONE:
- ret = vfs_clone_file_range(real_in.file, pos_in,
- real_out.file, pos_out, len, flags);
+ ret = vfs_clone_file_range(fd_file(real_in), pos_in,
+ fd_file(real_out), pos_out, len, flags);
break;
case OVL_DEDUPE:
- ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
- real_out.file, pos_out, len,
+ ret = vfs_dedupe_file_range_one(fd_file(real_in), pos_in,
+ fd_file(real_out), pos_out, len,
flags);
break;
}
@@ -583,9 +585,9 @@ static int ovl_flush(struct file *file, fl_owner_t id)
if (err)
return err;
- if (real.file->f_op->flush) {
+ if (fd_file(real)->f_op->flush) {
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- err = real.file->f_op->flush(real.file, id);
+ err = fd_file(real)->f_op->flush(fd_file(real), id);
revert_creds(old_cred);
}
fdput(real);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 7ffdc88dfb52..80675b6bf884 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -120,6 +120,7 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct nsproxy *nsp __free(put_nsproxy) = NULL;
struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
+ struct pid_namespace *pid_ns;
if (arg)
return -EINVAL;
@@ -202,7 +203,9 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PIDFD_GET_PID_NAMESPACE:
if (IS_ENABLED(CONFIG_PID_NS)) {
rcu_read_lock();
- ns_common = to_ns_common( get_pid_ns(task_active_pid_ns(task)));
+ pid_ns = task_active_pid_ns(task);
+ if (pid_ns)
+ ns_common = to_ns_common(get_pid_ns(pid_ns));
rcu_read_unlock();
}
break;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4083ba492cb6..12b22c2723b7 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1231,7 +1231,6 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
- .llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
.poll = pipe_poll,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e7810f3bd522..b31283d81c52 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2626,7 +2626,7 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (task_is_realtime(p))
+ if (rt_or_dl_task_policy(p))
slack_ns = 0;
else if (slack_ns == 0)
slack_ns = p->default_timer_slack_ns;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d19434e2a58e..626ad7bd94f2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -303,9 +303,7 @@ static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_read) read;
-
- read = pde->proc_ops->proc_read;
+ __auto_type read = pde->proc_ops->proc_read;
if (read)
return read(file, buf, count, ppos);
return -EIO;
@@ -327,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_write) write;
-
- write = pde->proc_ops->proc_write;
+ __auto_type write = pde->proc_ops->proc_write;
if (write)
return write(file, buf, count, ppos);
return -EIO;
@@ -351,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
- typeof_member(struct proc_ops, proc_poll) poll;
-
- poll = pde->proc_ops->proc_poll;
+ __auto_type poll = pde->proc_ops->proc_poll;
if (poll)
return poll(file, pts);
return DEFAULT_POLLMASK;
@@ -375,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
-
- ioctl = pde->proc_ops->proc_ioctl;
+ __auto_type ioctl = pde->proc_ops->proc_ioctl;
if (ioctl)
return ioctl(file, cmd, arg);
return -ENOTTY;
@@ -400,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl;
if (compat_ioctl)
return compat_ioctl(file, cmd, arg);
return -ENOTTY;
@@ -424,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
- typeof_member(struct proc_ops, proc_mmap) mmap;
-
- mmap = pde->proc_ops->proc_mmap;
+ __auto_type mmap = pde->proc_ops->proc_mmap;
if (mmap)
return mmap(file, vma);
return -EIO;
@@ -483,7 +471,6 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
typeof_member(struct proc_ops, proc_open) open;
- typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
if (!pde->proc_ops->proc_lseek)
@@ -510,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_ops->proc_release;
+ __auto_type release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -547,9 +534,7 @@ static int proc_reg_release(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
if (pde_is_permanent(pde)) {
- typeof_member(struct proc_ops, proc_release) release;
-
- release = pde->proc_ops->proc_release;
+ __auto_type release = pde->proc_ops->proc_release;
if (release) {
return release(inode, file);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9e3f25e4c188..87e4d6282025 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -166,8 +166,7 @@ static inline int folio_precise_page_mapcount(struct folio *folio,
{
int mapcount = atomic_read(&page->_mapcount) + 1;
- /* Handle page_has_type() pages */
- if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+ if (page_mapcount_is_type(mapcount))
mapcount = 0;
if (folio_test_large(folio))
mapcount += folio_entire_mapcount(folio);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b7a5c84b5819..a55f5acefa97 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -182,7 +182,6 @@ u64 stable_page_flags(const struct page *page)
#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -207,18 +206,16 @@ u64 stable_page_flags(const struct page *page)
u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
-#endif
-
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9553e77c9d31..d11ebc055ce0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -29,8 +29,13 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* Support for permanently empty directories */
-static struct ctl_table sysctl_mount_point[] = { };
+/*
+ * Support for permanently empty directories.
+ * Must be non-empty to avoid sharing an address with other tables.
+ */
+static struct ctl_table sysctl_mount_point[] = {
+ { }
+};
/**
* register_sysctl_mount_point() - registers a sysctl mount point
@@ -42,7 +47,7 @@ static struct ctl_table sysctl_mount_point[] = { };
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl(path, sysctl_mount_point);
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ade74a396968..72f14fd59c2d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -543,21 +543,6 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
}
}
- if (karg.build_id_size) {
- __u32 build_id_sz;
-
- err = build_id_parse(vma, build_id_buf, &build_id_sz);
- if (err) {
- karg.build_id_size = 0;
- } else {
- if (karg.build_id_size < build_id_sz) {
- err = -ENAMETOOLONG;
- goto out;
- }
- karg.build_id_size = build_id_sz;
- }
- }
-
if (karg.vma_name_size) {
size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
const struct path *path;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 84719e2bcbc4..f56b066ab80c 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -275,7 +275,7 @@ void pstore_record_init(struct pstore_record *record,
* end of the buffer.
*/
static void pstore_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ struct kmsg_dump_detail *detail)
{
struct kmsg_dump_iter iter;
unsigned long total = 0;
@@ -285,9 +285,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,
int saved_ret = 0;
int ret;
- why = kmsg_dump_reason_str(reason);
+ why = kmsg_dump_reason_str(detail->reason);
- if (pstore_cannot_block_path(reason)) {
+ if (pstore_cannot_block_path(detail->reason)) {
if (!raw_spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
pr_err("dump skipped in %s path because of concurrent dump\n",
in_nmi() ? "NMI" : why);
@@ -311,7 +311,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
pstore_record_init(&record, psinfo);
record.type = PSTORE_TYPE_DMESG;
record.count = oopscount;
- record.reason = reason;
+ record.reason = detail->reason;
record.part = part;
record.buf = psinfo->buf;
@@ -352,7 +352,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
}
ret = psinfo->write(&record);
- if (ret == 0 && reason == KMSG_DUMP_OOPS) {
+ if (ret == 0 && detail->reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
pstore_timer_kick();
} else {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7ae885e6d5d7..b40410cd39af 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -163,13 +163,12 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;
-int register_quota_format(struct quota_format_type *fmt)
+void register_quota_format(struct quota_format_type *fmt)
{
spin_lock(&dq_list_lock);
fmt->qf_next = quota_formats;
quota_formats = fmt;
spin_unlock(&dq_list_lock);
- return 0;
}
EXPORT_SYMBOL(register_quota_format);
@@ -1831,7 +1830,6 @@ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
- return;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1873,7 +1871,6 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
- return;
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -2406,7 +2403,7 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
- struct quota_format_type *fmt = find_quota_format(format_id);
+ struct quota_format_type *fmt;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
@@ -2416,6 +2413,7 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
return -EINVAL;
+ fmt = find_quota_format(format_id);
if (!fmt)
return -ESRCH;
if (!sb->dq_op || !sb->s_qcop ||
@@ -2596,7 +2594,8 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
goto out_err;
}
if (sb_has_quota_limits_enabled(sb, type)) {
- ret = -EBUSY;
+ /* compatible with XFS */
+ ret = -EEXIST;
goto out_err;
}
spin_lock(&dq_state_lock);
@@ -2610,9 +2609,6 @@ out_err:
if (flags & qtype_enforce_flag(type))
dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
- /* Error code translation for better compatibility with XFS */
- if (ret == -EBUSY)
- ret = -EEXIST;
return ret;
}
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0e41fb84060f..290157bc7bec 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -980,7 +980,7 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
int ret;
f = fdget_raw(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
ret = -EINVAL;
@@ -988,12 +988,12 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
goto out;
if (quotactl_cmd_write(cmds)) {
- ret = mnt_want_write(f.file->f_path.mnt);
+ ret = mnt_want_write(fd_file(f)->f_path.mnt);
if (ret)
goto out;
}
- sb = f.file->f_path.mnt->mnt_sb;
+ sb = fd_file(f)->f_path.mnt->mnt_sb;
if (quotactl_cmd_onoff(cmds))
down_write(&sb->s_umount);
else
@@ -1007,7 +1007,7 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
up_read(&sb->s_umount);
if (quotactl_cmd_write(cmds))
- mnt_drop_write(f.file->f_path.mnt);
+ mnt_drop_write(fd_file(f)->f_path.mnt);
out:
fdput(f);
return ret;
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 3f3e8acc05db..6f7f0b4afba9 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -235,7 +235,8 @@ static struct quota_format_type v1_quota_format = {
static int __init init_v1_quota_format(void)
{
- return register_quota_format(&v1_quota_format);
+ register_quota_format(&v1_quota_format);
+ return 0;
}
static void __exit exit_v1_quota_format(void)
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index c48c233f3bef..1fda93dcbc1b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -440,12 +440,9 @@ static struct quota_format_type v2r1_quota_format = {
static int __init init_v2_quota_format(void)
{
- int ret;
-
- ret = register_quota_format(&v2r0_quota_format);
- if (ret)
- return ret;
- return register_quota_format(&v2r1_quota_format);
+ register_quota_format(&v2r0_quota_format);
+ register_quota_format(&v2r1_quota_format);
+ return 0;
}
static void __exit exit_v2_quota_format(void)
diff --git a/fs/read_write.c b/fs/read_write.c
index 070a7c33b9dd..64dc24afdb3a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -387,12 +387,12 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
off_t retval;
struct fd f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence <= SEEK_MAX) {
- loff_t res = vfs_llseek(f.file, offset, whence);
+ loff_t res = vfs_llseek(fd_file(f), offset, whence);
retval = res;
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
@@ -423,14 +423,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
struct fd f = fdget_pos(fd);
loff_t offset;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence > SEEK_MAX)
goto out_putf;
- offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
+ offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
retval = (int)offset;
@@ -703,15 +703,15 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_read(f.file, buf, count, ppos);
+ ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return ret;
@@ -727,15 +727,15 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_write(f.file, buf, count, ppos);
+ ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
@@ -758,10 +758,10 @@ ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_read(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_read(fd_file(f), buf, count, &pos);
fdput(f);
}
@@ -792,10 +792,10 @@ ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_write(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_write(fd_file(f), buf, count, &pos);
fdput(f);
}
@@ -1078,15 +1078,15 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+ ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
@@ -1102,15 +1102,15 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+ ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
@@ -1136,10 +1136,10 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
@@ -1159,10 +1159,10 @@ static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
@@ -1328,19 +1328,19 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
*/
retval = -EBADF;
in = fdget(in_fd);
- if (!in.file)
+ if (!fd_file(in))
goto out;
- if (!(in.file->f_mode & FMODE_READ))
+ if (!(fd_file(in)->f_mode & FMODE_READ))
goto fput_in;
retval = -ESPIPE;
if (!ppos) {
- pos = in.file->f_pos;
+ pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
- if (!(in.file->f_mode & FMODE_PREAD))
+ if (!(fd_file(in)->f_mode & FMODE_PREAD))
goto fput_in;
}
- retval = rw_verify_area(READ, in.file, &pos, count);
+ retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
goto fput_in;
if (count > MAX_RW_COUNT)
@@ -1351,13 +1351,13 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
*/
retval = -EBADF;
out = fdget(out_fd);
- if (!out.file)
+ if (!fd_file(out))
goto fput_in;
- if (!(out.file->f_mode & FMODE_WRITE))
+ if (!(fd_file(out)->f_mode & FMODE_WRITE))
goto fput_out;
- in_inode = file_inode(in.file);
- out_inode = file_inode(out.file);
- out_pos = out.file->f_pos;
+ in_inode = file_inode(fd_file(in));
+ out_inode = file_inode(fd_file(out));
+ out_pos = fd_file(out)->f_pos;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
@@ -1377,33 +1377,33 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
* and the application is arguably buggy if it doesn't expect
* EAGAIN on a non-blocking file descriptor.
*/
- if (in.file->f_flags & O_NONBLOCK)
+ if (fd_file(in)->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- opipe = get_pipe_info(out.file, true);
+ opipe = get_pipe_info(fd_file(out), true);
if (!opipe) {
- retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+ retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
goto fput_out;
- retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
+ retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
- if (out.file->f_flags & O_NONBLOCK)
+ if (fd_file(out)->f_flags & O_NONBLOCK)
fl |= SPLICE_F_NONBLOCK;
- retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
+ retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
}
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
- fsnotify_access(in.file);
- fsnotify_modify(out.file);
- out.file->f_pos = out_pos;
+ fsnotify_access(fd_file(in));
+ fsnotify_modify(fd_file(out));
+ fd_file(out)->f_pos = out_pos;
if (ppos)
*ppos = pos;
else
- in.file->f_pos = pos;
+ fd_file(in)->f_pos = pos;
}
inc_syscr(current);
@@ -1676,11 +1676,11 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
ssize_t ret = -EBADF;
f_in = fdget(fd_in);
- if (!f_in.file)
+ if (!fd_file(f_in))
goto out2;
f_out = fdget(fd_out);
- if (!f_out.file)
+ if (!fd_file(f_out))
goto out1;
ret = -EFAULT;
@@ -1688,21 +1688,21 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
goto out;
} else {
- pos_in = f_in.file->f_pos;
+ pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
goto out;
} else {
- pos_out = f_out.file->f_pos;
+ pos_out = fd_file(f_out)->f_pos;
}
ret = -EINVAL;
if (flags != 0)
goto out;
- ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
if (ret > 0) {
pos_in += ret;
@@ -1712,14 +1712,14 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_in.file->f_pos = pos_in;
+ fd_file(f_in)->f_pos = pos_in;
}
if (off_out) {
if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_out.file->f_pos = pos_out;
+ fd_file(f_out)->f_pos = pos_out;
}
}
diff --git a/fs/readdir.c b/fs/readdir.c
index d6c82421902a..6d29cab8576e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -225,10 +225,10 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
.dirent = dirent
};
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
@@ -318,10 +318,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
int error;
f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(fd_file(f), &buf.ctx);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -401,10 +401,10 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
int error;
f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(fd_file(f), &buf.ctx);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
@@ -483,10 +483,10 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
.dirent = dirent
};
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
@@ -569,10 +569,10 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
int error;
f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = iterate_dir(f.file, &buf.ctx);
+ error = iterate_dir(fd_file(f), &buf.ctx);
if (error >= 0)
error = buf.error;
if (buf.prev_reclen) {
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 28246dfc8485..4403d5c68fcb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -537,7 +537,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
for (i = 0, info = same->info; i < count; i++, info++) {
struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = dst_fd.file;
+ struct file *dst_file = fd_file(dst_fd);
if (!dst_file) {
info->status = -EBADF;
diff --git a/fs/select.c b/fs/select.c
index cae82e9e0dcc..a77907faf2b4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -529,10 +529,10 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
continue;
mask = EPOLLNVAL;
f = fdget(i);
- if (f.file) {
+ if (fd_file(f)) {
wait_key_set(wait, in, out, bit,
busy_flag);
- mask = vfs_poll(f.file, wait);
+ mask = vfs_poll(fd_file(f), wait);
fdput(f);
}
@@ -777,7 +777,9 @@ static inline int get_sigset_argpack(struct sigset_argpack *to,
{
// the path is hot enough for overhead of copy_from_user() to matter
if (from) {
- if (!user_read_access_begin(from, sizeof(*from)))
+ if (can_do_masked_user_access())
+ from = masked_user_access_begin(from);
+ else if (!user_read_access_begin(from, sizeof(*from)))
return -EFAULT;
unsafe_get_user(to->p, &from->p, Efault);
unsafe_get_user(to->size, &from->size, Efault);
@@ -861,13 +863,13 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
goto out;
mask = EPOLLNVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag;
- mask = vfs_poll(f.file, pwait);
+ mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d0333bce015e..736bebf93591 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -289,10 +289,10 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
fd_install(ufd, file);
} else {
struct fd f = fdget(ufd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ctx = f.file->private_data;
- if (f.file->f_op != &signalfd_fops) {
+ ctx = fd_file(f)->private_data;
+ if (fd_file(f)->f_op != &signalfd_fops) {
fdput(f);
return -EINVAL;
}
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 7481b21a0489..2d851f596a72 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -416,7 +416,7 @@ find_timestamp(struct cifs_ses *ses)
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
- const struct nls_table *nls_cp)
+ const struct nls_table *nls_cp, struct shash_desc *hmacmd5)
{
int rc = 0;
int len;
@@ -425,34 +425,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
wchar_t *domain;
wchar_t *server;
- if (!ses->server->secmech.hmacmd5) {
- cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
- return -1;
- }
-
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash, nls_cp);
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash,
- CIFS_NTHASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
+ cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc);
return rc;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
return rc;
}
/* convert ses->user_name to unicode */
len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (user == NULL) {
- rc = -ENOMEM;
- return rc;
- }
+ if (user == NULL)
+ return -ENOMEM;
if (len) {
len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
@@ -461,11 +453,10 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
*(u16 *)user = 0;
}
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)user, 2 * len);
+ rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len);
kfree(user);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc);
return rc;
}
@@ -474,19 +465,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = strlen(ses->domainName);
domain = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (domain == NULL) {
- rc = -ENOMEM;
- return rc;
- }
+ if (domain == NULL)
+ return -ENOMEM;
+
len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
nls_cp);
- rc =
- crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)domain, 2 * len);
+ rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len);
kfree(domain);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with domain\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc);
return rc;
}
} else {
@@ -494,33 +481,27 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = strlen(ses->ip_addr);
server = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (server == NULL) {
- rc = -ENOMEM;
- return rc;
- }
- len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len,
- nls_cp);
- rc =
- crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)server, 2 * len);
+ if (server == NULL)
+ return -ENOMEM;
+
+ len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp);
+ rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len);
kfree(server);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with server\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc);
return rc;
}
}
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ntlmv2_hash);
+ rc = crypto_shash_final(hmacmd5, ntlmv2_hash);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
return rc;
}
static int
-CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
+CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5)
{
int rc;
struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
@@ -531,43 +512,33 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
offsetof(struct ntlmv2_resp, challenge.key[0]));
- if (!ses->server->secmech.hmacmd5) {
- cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
- return -1;
- }
-
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
return rc;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
return rc;
}
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
- memcpy(ntlmv2->challenge.key,
- ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
- memcpy(ntlmv2->challenge.key,
- ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- ntlmv2->challenge.key, hash_len);
+ memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+
+ rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
return rc;
}
/* Note that the MD5 digest over writes anon.challenge_key.key */
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ntlmv2->ntlmv2_hash);
+ rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
return rc;
}
@@ -575,6 +546,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
+ struct shash_desc *hmacmd5 = NULL;
int rc;
int baselen;
unsigned int tilen;
@@ -640,55 +612,51 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_server_lock(ses->server);
- rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5);
+ rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
if (rc) {
+ cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
goto unlock;
}
/* calculate ntlmv2_hash */
- rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5);
if (rc) {
- cifs_dbg(VFS, "Could not get v2 hash rc %d\n", rc);
+ cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc);
goto unlock;
}
/* calculate first part of the client response (CR1) */
- rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+ rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5);
if (rc) {
- cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
+ cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc);
goto unlock;
}
/* now calculate the session key for NTLMv2 */
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- ntlmv2->ntlmv2_hash,
- CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ses->auth_key.response);
+ rc = crypto_shash_final(hmacmd5, ses->auth_key.response);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
unlock:
cifs_server_unlock(ses->server);
+ cifs_free_hash(&hmacmd5);
setup_ntlmv2_rsp_ret:
kfree_sensitive(tiblob);
@@ -732,16 +700,19 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
cifs_free_hash(&server->secmech.aes_cmac);
cifs_free_hash(&server->secmech.hmacsha256);
cifs_free_hash(&server->secmech.md5);
- cifs_free_hash(&server->secmech.sha512);
- cifs_free_hash(&server->secmech.hmacmd5);
- if (server->secmech.enc) {
- crypto_free_aead(server->secmech.enc);
- server->secmech.enc = NULL;
- }
+ if (!SERVER_IS_CHAN(server)) {
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
+ }
- if (server->secmech.dec) {
- crypto_free_aead(server->secmech.dec);
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
+ server->secmech.dec = NULL;
+ }
+ } else {
+ server->secmech.enc = NULL;
server->secmech.dec = NULL;
}
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 61ded59b858f..71b720dbb2ce 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -146,6 +146,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 50
-#define CIFS_VERSION "2.50"
+#define SMB3_PRODUCT_BUILD 51
+#define CIFS_VERSION "2.51"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index a71a988a92f9..315aac5dec05 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -178,10 +178,8 @@ struct session_key {
/* crypto hashing related structure/fields, not specific to a sec mech */
struct cifs_secmech {
- struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */
struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
- struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
@@ -821,6 +819,7 @@ struct TCP_Server_Info {
* format: \\HOST\SHARE[\OPTIONAL PATH]
*/
char *leaf_fullpath;
+ bool dfs_conn:1;
};
static inline bool is_smb1(struct TCP_Server_Info *server)
@@ -1059,6 +1058,7 @@ struct cifs_ses {
struct list_head smb_ses_list;
struct list_head rlist; /* reconnect list */
struct list_head tcon_list;
+ struct list_head dlist; /* dfs list */
struct cifs_tcon *tcon_ipc;
spinlock_t ses_lock; /* protect anything here that is not protected */
struct mutex session_mutex;
@@ -1287,6 +1287,7 @@ struct cifs_tcon {
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
struct delayed_work dfs_cache_work;
+ struct list_head dfs_ses_list;
#endif
struct delayed_work query_interfaces; /* query interfaces workqueue job */
char *origin_fullpath; /* canonical copy of smb3_fs_context::source */
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index c69e3f48a60c..68c716e6261b 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -724,15 +724,9 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
-/* Put references of @ses and its children */
static inline void cifs_put_smb_ses(struct cifs_ses *ses)
{
- struct cifs_ses *next;
-
- do {
- next = ses->dfs_root_ses;
- __cifs_put_smb_ses(ses);
- } while ((ses = next));
+ __cifs_put_smb_ses(ses);
}
/* Get an active reference of @ses and its children.
@@ -746,9 +740,7 @@ static inline void cifs_put_smb_ses(struct cifs_ses *ses)
static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses)
{
lockdep_assert_held(&cifs_tcp_ses_lock);
-
- for (; ses; ses = ses->dfs_root_ses)
- ses->ses_count++;
+ ses->ses_count++;
}
static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 08a41c7aaf72..adf8758847f6 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -811,13 +811,9 @@ cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter
unsigned int to_read)
{
struct msghdr smb_msg = { .msg_iter = *iter };
- int ret;
iov_iter_truncate(&smb_msg.msg_iter, to_read);
- ret = cifs_readv_from_socket(server, &smb_msg);
- if (ret > 0)
- iov_iter_advance(iter, ret);
- return ret;
+ return cifs_readv_from_socket(server, &smb_msg);
}
static bool
@@ -1530,6 +1526,9 @@ static int match_server(struct TCP_Server_Info *server,
if (server->nosharesock)
return 0;
+ if (!match_super && (ctx->dfs_conn || server->dfs_conn))
+ return 0;
+
/* If multidialect negotiation see if existing sessions match one */
if (strcmp(ctx->vals->version_string, SMB3ANY_VERSION_STRING) == 0) {
if (server->vals->protocol_id < SMB30_PROT_ID)
@@ -1723,6 +1722,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
if (ctx->nosharesock)
tcp_ses->nosharesock = true;
+ tcp_ses->dfs_conn = ctx->dfs_conn;
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
@@ -1873,13 +1873,15 @@ out_err:
}
/* this function must be called with ses_lock and chan_lock held */
-static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
+static int match_session(struct cifs_ses *ses,
+ struct smb3_fs_context *ctx,
+ bool match_super)
{
if (ctx->sectype != Unspecified &&
ctx->sectype != ses->sectype)
return 0;
- if (ctx->dfs_root_ses != ses->dfs_root_ses)
+ if (!match_super && ctx->dfs_root_ses != ses->dfs_root_ses)
return 0;
/*
@@ -1998,7 +2000,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
continue;
}
spin_lock(&ses->chan_lock);
- if (match_session(ses, ctx)) {
+ if (match_session(ses, ctx, false)) {
spin_unlock(&ses->chan_lock);
spin_unlock(&ses->ses_lock);
ret = ses;
@@ -2058,8 +2060,7 @@ void __cifs_put_smb_ses(struct cifs_ses *ses)
if (do_logoff) {
xid = get_xid();
rc = server->ops->logoff(xid, ses);
- if (rc)
- cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+ cifs_server_dbg(FYI, "%s: Session Logoff: rc=%d\n",
__func__, rc);
_free_xid(xid);
}
@@ -2382,8 +2383,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
* need to lock before changing something in the session.
*/
spin_lock(&cifs_tcp_ses_lock);
- if (ctx->dfs_root_ses)
- cifs_smb_ses_inc_refcount(ctx->dfs_root_ses);
ses->dfs_root_ses = ctx->dfs_root_ses;
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2458,6 +2457,7 @@ cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace)
{
unsigned int xid;
struct cifs_ses *ses;
+ LIST_HEAD(ses_list);
/*
* IPC tcon share the lifetime of their session and are
@@ -2482,6 +2482,9 @@ cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace)
list_del_init(&tcon->tcon_list);
tcon->status = TID_EXITING;
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ list_replace_init(&tcon->dfs_ses_list, &ses_list);
+#endif
spin_unlock(&tcon->tc_lock);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2509,6 +2512,9 @@ cifs_put_tcon(struct cifs_tcon *tcon, enum smb3_tcon_ref_trace trace)
cifs_fscache_release_super_cookie(tcon);
tconInfoFree(tcon, netfs_trace_tcon_ref_free);
cifs_put_smb_ses(ses);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_put_root_smb_sessions(&ses_list);
+#endif
}
/**
@@ -2892,7 +2898,7 @@ cifs_match_super(struct super_block *sb, void *data)
spin_lock(&ses->chan_lock);
spin_lock(&tcon->tc_lock);
if (!match_server(tcp_srv, ctx, true) ||
- !match_session(ses, ctx) ||
+ !match_session(ses, ctx, true) ||
!match_tcon(tcon, ctx) ||
!match_prepath(sb, tcon, mnt_data)) {
rc = 0;
@@ -3623,13 +3629,12 @@ out:
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- bool isdfs;
int rc;
- rc = dfs_mount_share(&mnt_ctx, &isdfs);
+ rc = dfs_mount_share(&mnt_ctx);
if (rc)
goto error;
- if (!isdfs)
+ if (!ctx->dfs_conn)
goto out;
/*
@@ -4034,7 +4039,7 @@ cifs_set_vol_auth(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
static struct cifs_tcon *
-__cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
{
int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
@@ -4132,17 +4137,6 @@ out:
return tcon;
}
-static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
-{
- struct cifs_tcon *ret;
-
- cifs_mount_lock();
- ret = __cifs_construct_tcon(cifs_sb, fsuid);
- cifs_mount_unlock();
- return ret;
-}
-
struct cifs_tcon *
cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
{
@@ -4212,9 +4206,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
struct tcon_link *
cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
{
- int ret;
- kuid_t fsuid = current_fsuid();
struct tcon_link *tlink, *newtlink;
+ kuid_t fsuid = current_fsuid();
+ int err;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -4249,9 +4243,9 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
spin_unlock(&cifs_sb->tlink_tree_lock);
} else {
wait_for_construction:
- ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
+ err = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
TASK_INTERRUPTIBLE);
- if (ret) {
+ if (err) {
cifs_put_tlink(tlink);
return ERR_PTR(-ERESTARTSYS);
}
@@ -4262,8 +4256,9 @@ wait_for_construction:
/* return error if we tried this already recently */
if (time_before(jiffies, tlink->tl_time + TLINK_ERROR_EXPIRE)) {
+ err = PTR_ERR(tlink->tl_tcon);
cifs_put_tlink(tlink);
- return ERR_PTR(-EACCES);
+ return ERR_PTR(err);
}
if (test_and_set_bit(TCON_LINK_PENDING, &tlink->tl_flags))
@@ -4275,8 +4270,11 @@ wait_for_construction:
wake_up_bit(&tlink->tl_flags, TCON_LINK_PENDING);
if (IS_ERR(tlink->tl_tcon)) {
+ err = PTR_ERR(tlink->tl_tcon);
+ if (err == -ENOKEY)
+ err = -EACCES;
cifs_put_tlink(tlink);
- return ERR_PTR(-EACCES);
+ return ERR_PTR(err);
}
return tlink;
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 3ec965547e3d..3f6077c68d68 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -69,7 +69,7 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
* Get an active reference of @ses so that next call to cifs_put_tcon() won't
* release it as any new DFS referrals must go through its IPC tcon.
*/
-static void add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
+static void set_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
struct cifs_ses *ses = mnt_ctx->ses;
@@ -95,7 +95,7 @@ static inline int parse_dfs_target(struct smb3_fs_context *ctx,
return rc;
}
-static int set_ref_paths(struct cifs_mount_ctx *mnt_ctx,
+static int setup_dfs_ref(struct cifs_mount_ctx *mnt_ctx,
struct dfs_info3_param *tgt,
struct dfs_ref_walk *rw)
{
@@ -120,6 +120,7 @@ static int set_ref_paths(struct cifs_mount_ctx *mnt_ctx,
}
ref_walk_path(rw) = ref_path;
ref_walk_fpath(rw) = full_path;
+ ref_walk_ses(rw) = ctx->dfs_root_ses;
return 0;
}
@@ -128,11 +129,11 @@ static int __dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx,
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
struct dfs_info3_param tgt = {};
- bool is_refsrv;
int rc = -ENOENT;
again:
do {
+ ctx->dfs_root_ses = ref_walk_ses(rw);
if (ref_walk_empty(rw)) {
rc = dfs_get_referral(mnt_ctx, ref_walk_path(rw) + 1,
NULL, ref_walk_tl(rw));
@@ -158,10 +159,7 @@ again:
if (rc)
continue;
- is_refsrv = tgt.server_type == DFS_TYPE_ROOT ||
- DFS_INTERLINK(tgt.flags);
ref_walk_set_tgt_hint(rw);
-
if (tgt.flags & DFSREF_STORAGE_SERVER) {
rc = cifs_mount_get_tcon(mnt_ctx);
if (!rc)
@@ -172,12 +170,10 @@ again:
continue;
}
- if (is_refsrv)
- add_root_smb_session(mnt_ctx);
-
+ set_root_smb_session(mnt_ctx);
rc = ref_walk_advance(rw);
if (!rc) {
- rc = set_ref_paths(mnt_ctx, &tgt, rw);
+ rc = setup_dfs_ref(mnt_ctx, &tgt, rw);
if (!rc) {
rc = -EREMOTE;
goto again;
@@ -193,20 +189,22 @@ out:
return rc;
}
-static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx)
+static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx,
+ struct dfs_ref_walk **rw)
{
- struct dfs_ref_walk *rw;
int rc;
- rw = ref_walk_alloc();
- if (IS_ERR(rw))
- return PTR_ERR(rw);
+ *rw = ref_walk_alloc();
+ if (IS_ERR(*rw)) {
+ rc = PTR_ERR(*rw);
+ *rw = NULL;
+ return rc;
+ }
- ref_walk_init(rw);
- rc = set_ref_paths(mnt_ctx, NULL, rw);
+ ref_walk_init(*rw);
+ rc = setup_dfs_ref(mnt_ctx, NULL, *rw);
if (!rc)
- rc = __dfs_referral_walk(mnt_ctx, rw);
- ref_walk_free(rw);
+ rc = __dfs_referral_walk(mnt_ctx, *rw);
return rc;
}
@@ -214,16 +212,16 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
{
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct dfs_ref_walk *rw = NULL;
struct cifs_tcon *tcon;
char *origin_fullpath;
- bool new_tcon = true;
int rc;
origin_fullpath = dfs_get_path(cifs_sb, ctx->source);
if (IS_ERR(origin_fullpath))
return PTR_ERR(origin_fullpath);
- rc = dfs_referral_walk(mnt_ctx);
+ rc = dfs_referral_walk(mnt_ctx, &rw);
if (!rc) {
/*
* Prevent superblock from being created with any missing
@@ -241,21 +239,16 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
tcon = mnt_ctx->tcon;
spin_lock(&tcon->tc_lock);
- if (!tcon->origin_fullpath) {
- tcon->origin_fullpath = origin_fullpath;
- origin_fullpath = NULL;
- } else {
- new_tcon = false;
- }
+ tcon->origin_fullpath = origin_fullpath;
+ origin_fullpath = NULL;
+ ref_walk_set_tcon(rw, tcon);
spin_unlock(&tcon->tc_lock);
-
- if (new_tcon) {
- queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
- dfs_cache_get_ttl() * HZ);
- }
+ queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
+ dfs_cache_get_ttl() * HZ);
out:
kfree(origin_fullpath);
+ ref_walk_free(rw);
return rc;
}
@@ -279,7 +272,7 @@ static int update_fs_context_dstaddr(struct smb3_fs_context *ctx)
return rc;
}
-int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
bool nodfs = ctx->nodfs;
@@ -289,7 +282,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
if (rc)
return rc;
- *isdfs = false;
rc = get_session(mnt_ctx, NULL);
if (rc)
return rc;
@@ -317,10 +309,15 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
return rc;
}
- *isdfs = true;
- add_root_smb_session(mnt_ctx);
- rc = __dfs_mount_share(mnt_ctx);
- dfs_put_root_smb_sessions(mnt_ctx);
+ if (!ctx->dfs_conn) {
+ ctx->dfs_conn = true;
+ cifs_mount_put_conns(mnt_ctx);
+ rc = get_session(mnt_ctx, NULL);
+ }
+ if (!rc) {
+ set_root_smb_session(mnt_ctx);
+ rc = __dfs_mount_share(mnt_ctx);
+ }
return rc;
}
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index e5c4dcf83750..1aa2bc65b3bc 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -19,6 +19,7 @@
struct dfs_ref {
char *path;
char *full_path;
+ struct cifs_ses *ses;
struct dfs_cache_tgt_list tl;
struct dfs_cache_tgt_iterator *tit;
};
@@ -38,6 +39,7 @@ struct dfs_ref_walk {
#define ref_walk_path(w) (ref_walk_cur(w)->path)
#define ref_walk_fpath(w) (ref_walk_cur(w)->full_path)
#define ref_walk_tl(w) (&ref_walk_cur(w)->tl)
+#define ref_walk_ses(w) (ref_walk_cur(w)->ses)
static inline struct dfs_ref_walk *ref_walk_alloc(void)
{
@@ -60,14 +62,19 @@ static inline void __ref_walk_free(struct dfs_ref *ref)
kfree(ref->path);
kfree(ref->full_path);
dfs_cache_free_tgts(&ref->tl);
+ if (ref->ses)
+ cifs_put_smb_ses(ref->ses);
memset(ref, 0, sizeof(*ref));
}
static inline void ref_walk_free(struct dfs_ref_walk *rw)
{
- struct dfs_ref *ref = ref_walk_start(rw);
+ struct dfs_ref *ref;
- for (; ref <= ref_walk_end(rw); ref++)
+ if (!rw)
+ return;
+
+ for (ref = ref_walk_start(rw); ref <= ref_walk_end(rw); ref++)
__ref_walk_free(ref);
kfree(rw);
}
@@ -116,9 +123,22 @@ static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw)
ref_walk_tit(rw));
}
+static inline void ref_walk_set_tcon(struct dfs_ref_walk *rw,
+ struct cifs_tcon *tcon)
+{
+ struct dfs_ref *ref = ref_walk_start(rw);
+
+ for (; ref <= ref_walk_cur(rw); ref++) {
+ if (WARN_ON_ONCE(!ref->ses))
+ continue;
+ list_add(&ref->ses->dlist, &tcon->dfs_ses_list);
+ ref->ses = NULL;
+ }
+}
+
int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
struct smb3_fs_context *ctx);
-int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs);
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx);
static inline char *dfs_get_path(struct cifs_sb_info *cifs_sb, const char *path)
{
@@ -142,20 +162,14 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p
* references of all DFS root sessions that were used across the mount process
* in dfs_mount_share().
*/
-static inline void dfs_put_root_smb_sessions(struct cifs_mount_ctx *mnt_ctx)
+static inline void dfs_put_root_smb_sessions(struct list_head *head)
{
- const struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct cifs_ses *ses = ctx->dfs_root_ses;
- struct cifs_ses *cur;
-
- if (!ses)
- return;
+ struct cifs_ses *ses, *n;
- for (cur = ses; cur; cur = cur->dfs_root_ses) {
- if (cur->dfs_root_ses)
- cifs_put_smb_ses(cur->dfs_root_ses);
+ list_for_each_entry_safe(ses, n, head, dlist) {
+ list_del_init(&ses->dlist);
+ cifs_put_smb_ses(ses);
}
- cifs_put_smb_ses(ses);
}
#endif /* _CIFS_DFS_H */
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 11c8efecf7aa..110f03df012a 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -126,6 +126,7 @@ static inline void free_tgts(struct cache_entry *ce)
static inline void flush_cache_ent(struct cache_entry *ce)
{
+ cifs_dbg(FYI, "%s: %s\n", __func__, ce->path);
hlist_del_init(&ce->hlist);
kfree(ce->path);
free_tgts(ce);
@@ -441,34 +442,31 @@ static struct cache_entry *alloc_cache_entry(struct dfs_info3_param *refs, int n
return ce;
}
-static void remove_oldest_entry_locked(void)
+/* Remove all referrals that have a single target or oldest entry */
+static void purge_cache(void)
{
int i;
struct cache_entry *ce;
- struct cache_entry *to_del = NULL;
-
- WARN_ON(!rwsem_is_locked(&htable_rw_lock));
+ struct cache_entry *oldest = NULL;
for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
struct hlist_head *l = &cache_htable[i];
+ struct hlist_node *n;
- hlist_for_each_entry(ce, l, hlist) {
+ hlist_for_each_entry_safe(ce, n, l, hlist) {
if (hlist_unhashed(&ce->hlist))
continue;
- if (!to_del || timespec64_compare(&ce->etime,
- &to_del->etime) < 0)
- to_del = ce;
+ if (ce->numtgts == 1)
+ flush_cache_ent(ce);
+ else if (!oldest ||
+ timespec64_compare(&ce->etime,
+ &oldest->etime) < 0)
+ oldest = ce;
}
}
- if (!to_del) {
- cifs_dbg(FYI, "%s: no entry to remove\n", __func__);
- return;
- }
-
- cifs_dbg(FYI, "%s: removing entry\n", __func__);
- dump_ce(to_del);
- flush_cache_ent(to_del);
+ if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES && oldest)
+ flush_cache_ent(oldest);
}
/* Add a new DFS cache entry */
@@ -484,7 +482,7 @@ static struct cache_entry *add_cache_entry_locked(struct dfs_info3_param *refs,
if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) {
cifs_dbg(FYI, "%s: reached max cache size (%d)\n", __func__, CACHE_MAX_ENTRIES);
- remove_oldest_entry_locked();
+ purge_cache();
}
rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash);
@@ -1095,16 +1093,18 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
return 0;
}
-static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, const char *s2)
+static bool target_share_equal(struct cifs_tcon *tcon, const char *s1)
{
- char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct sockaddr_storage ss;
const char *host;
+ const char *s2 = &tcon->tree_name[1];
size_t hostlen;
- struct sockaddr_storage ss;
+ char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
bool match;
int rc;
- if (strcasecmp(s1, s2))
+ if (strcasecmp(s2, s1))
return false;
/*
@@ -1128,34 +1128,6 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
return match;
}
-/*
- * Mark dfs tcon for reconnecting when the currently connected tcon does not match any of the new
- * target shares in @refs.
- */
-static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server,
- const char *path,
- struct dfs_cache_tgt_list *old_tl,
- struct dfs_cache_tgt_list *new_tl)
-{
- struct dfs_cache_tgt_iterator *oit, *nit;
-
- for (oit = dfs_cache_get_tgt_iterator(old_tl); oit;
- oit = dfs_cache_get_next_tgt(old_tl, oit)) {
- for (nit = dfs_cache_get_tgt_iterator(new_tl); nit;
- nit = dfs_cache_get_next_tgt(new_tl, nit)) {
- if (target_share_equal(server,
- dfs_cache_get_tgt_name(oit),
- dfs_cache_get_tgt_name(nit))) {
- dfs_cache_noreq_update_tgthint(path, nit);
- return;
- }
- }
- }
-
- cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- cifs_signal_cifsd_for_reconnect(server, true);
-}
-
static bool is_ses_good(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
@@ -1172,41 +1144,35 @@ static bool is_ses_good(struct cifs_ses *ses)
return ret;
}
-/* Refresh dfs referral of @ses and mark it for reconnect if needed */
-static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh)
+static char *get_ses_refpath(struct cifs_ses *ses)
{
struct TCP_Server_Info *server = ses->server;
- DFS_CACHE_TGT_LIST(old_tl);
- DFS_CACHE_TGT_LIST(new_tl);
- bool needs_refresh = false;
- struct cache_entry *ce;
- unsigned int xid;
- char *path = NULL;
- int rc = 0;
-
- xid = get_xid();
+ char *path = ERR_PTR(-ENOENT);
mutex_lock(&server->refpath_lock);
if (server->leaf_fullpath) {
path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC);
if (!path)
- rc = -ENOMEM;
+ path = ERR_PTR(-ENOMEM);
}
mutex_unlock(&server->refpath_lock);
- if (!path)
- goto out;
+ return path;
+}
- down_read(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
- if (!IS_ERR(ce)) {
- rc = get_targets(ce, &old_tl);
- cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc);
- }
- up_read(&htable_rw_lock);
+/* Refresh dfs referral of @ses */
+static void refresh_ses_referral(struct cifs_ses *ses)
+{
+ struct cache_entry *ce;
+ unsigned int xid;
+ char *path;
+ int rc = 0;
- if (!needs_refresh) {
- rc = 0;
+ xid = get_xid();
+
+ path = get_ses_refpath(ses);
+ if (IS_ERR(path)) {
+ rc = PTR_ERR(path);
+ path = NULL;
goto out;
}
@@ -1217,29 +1183,106 @@ static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh)
goto out;
}
- ce = cache_refresh_path(xid, ses, path, true);
- if (!IS_ERR(ce)) {
- rc = get_targets(ce, &new_tl);
+ ce = cache_refresh_path(xid, ses, path, false);
+ if (!IS_ERR(ce))
up_read(&htable_rw_lock);
- cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc);
- mark_for_reconnect_if_needed(server, path, &old_tl, &new_tl);
- }
+ else
+ rc = PTR_ERR(ce);
out:
free_xid(xid);
- dfs_cache_free_tgts(&old_tl);
- dfs_cache_free_tgts(&new_tl);
kfree(path);
}
-static inline void refresh_ses_referral(struct cifs_ses *ses)
+static int __refresh_tcon_referral(struct cifs_tcon *tcon,
+ const char *path,
+ struct dfs_info3_param *refs,
+ int numrefs, bool force_refresh)
{
- __refresh_ses_referral(ses, false);
+ struct cache_entry *ce;
+ bool reconnect = force_refresh;
+ int rc = 0;
+ int i;
+
+ if (unlikely(!numrefs))
+ return 0;
+
+ if (force_refresh) {
+ for (i = 0; i < numrefs; i++) {
+ /* TODO: include prefix paths in the matching */
+ if (target_share_equal(tcon, refs[i].node_name)) {
+ reconnect = false;
+ break;
+ }
+ }
+ }
+
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ if (!IS_ERR(ce)) {
+ if (force_refresh || cache_entry_expired(ce))
+ rc = update_cache_entry_locked(ce, refs, numrefs);
+ } else if (PTR_ERR(ce) == -ENOENT) {
+ ce = add_cache_entry_locked(refs, numrefs);
+ }
+ up_write(&htable_rw_lock);
+
+ if (IS_ERR(ce))
+ rc = PTR_ERR(ce);
+ if (reconnect) {
+ cifs_tcon_dbg(FYI, "%s: mark for reconnect\n", __func__);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
+ }
+ return rc;
}
-static inline void force_refresh_ses_referral(struct cifs_ses *ses)
+static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
{
- __refresh_ses_referral(ses, true);
+ struct dfs_info3_param *refs = NULL;
+ struct cache_entry *ce;
+ struct cifs_ses *ses;
+ unsigned int xid;
+ bool needs_refresh;
+ char *path;
+ int numrefs = 0;
+ int rc = 0;
+
+ xid = get_xid();
+ ses = tcon->ses;
+
+ path = get_ses_refpath(ses);
+ if (IS_ERR(path)) {
+ rc = PTR_ERR(path);
+ path = NULL;
+ goto out;
+ }
+
+ down_read(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
+ if (!needs_refresh) {
+ up_read(&htable_rw_lock);
+ goto out;
+ }
+ up_read(&htable_rw_lock);
+
+ ses = CIFS_DFS_ROOT_SES(ses);
+ if (!is_ses_good(ses)) {
+ cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n",
+ __func__);
+ goto out;
+ }
+
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
+ if (!rc) {
+ rc = __refresh_tcon_referral(tcon, path, refs,
+ numrefs, force_refresh);
+ }
+
+out:
+ free_xid(xid);
+ kfree(path);
+ free_dfs_info_array(refs, numrefs);
}
/**
@@ -1280,7 +1323,7 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
*/
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- force_refresh_ses_referral(tcon->ses);
+ refresh_tcon_referral(tcon, true);
return 0;
}
@@ -1292,8 +1335,9 @@ void dfs_cache_refresh(struct work_struct *work)
tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);
- for (ses = tcon->ses; ses; ses = ses->dfs_root_ses)
+ list_for_each_entry(ses, &tcon->dfs_ses_list, dlist)
refresh_ses_referral(ses);
+ refresh_tcon_referral(tcon, false);
queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
atomic_read(&dfs_cache_ttl) * HZ);
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index cf577ec0dd0a..69f9d938b336 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -284,6 +284,7 @@ struct smb3_fs_context {
struct cifs_ses *dfs_root_ses;
bool dfs_automount:1; /* set for dfs automount only */
enum cifs_reparse_type reparse_type;
+ bool dfs_conn:1; /* set for dfs mounts */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 331a86074ae7..647f9bedd9fc 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -834,10 +834,6 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_mode = S_IFREG | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_REG;
- /* clear write bits if ATTR_READONLY is set */
- if (fattr->cf_cifsattrs & ATTR_READONLY)
- fattr->cf_mode &= ~(S_IWUGO);
-
/*
* Don't accept zero nlink from non-unix servers unless
* delete is pending. Instead mark it as unknown.
@@ -850,6 +846,10 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
}
}
+ /* clear write bits if ATTR_READONLY is set */
+ if (fattr->cf_cifsattrs & ATTR_READONLY)
+ fattr->cf_mode &= ~(S_IWUGO);
+
out_reparse:
if (S_ISLNK(fattr->cf_mode)) {
if (likely(data->symlink_target))
@@ -1267,11 +1267,14 @@ handle_mnt_opt:
__func__, rc);
goto out;
}
- }
-
- /* fill in remaining high mode bits e.g. SUID, VTX */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+ } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+ /* fill in remaining high mode bits e.g. SUID, VTX */
cifs_sfu_mode(fattr, full_path, cifs_sb, xid);
+ else if (!(tcon->posix_extensions))
+ /* clear write bits if ATTR_READONLY is set */
+ if (fattr->cf_cifsattrs & ATTR_READONLY)
+ fattr->cf_mode &= ~(S_IWUGO);
+
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 9bb5c869f4db..2ce193609d8b 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -90,23 +90,23 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
}
src_file = fdget(srcfd);
- if (!src_file.file) {
+ if (!fd_file(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
- if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
goto out_fput;
}
- src_inode = file_inode(src_file.file);
+ src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
goto out_fput;
- rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
+ rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index dab526191b07..054f10ebf65a 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -145,6 +145,9 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
mutex_init(&ret_buf->fscache_lock);
#endif
trace_smb3_tcon_ref(ret_buf->debug_id, ret_buf->tc_count, trace);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
+#endif
return ret_buf;
}
@@ -1108,7 +1111,8 @@ static void tcon_super_cb(struct super_block *sb, void *arg)
t2 = cifs_sb_master_tcon(cifs_sb);
spin_lock(&t2->tc_lock);
- if (t1->ses == t2->ses &&
+ if ((t1->ses == t2->ses ||
+ t1->ses->dfs_root_ses == t2->ses->dfs_root_ses) &&
t1->ses->server == t2->ses->server &&
t2->origin_fullpath &&
dfs_src_pathname_equal(t2->origin_fullpath, t1->origin_fullpath))
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index 4a517b280f2b..0f788031b740 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -240,7 +240,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
ctx->source = NULL;
goto out;
}
- ctx->dfs_automount = is_dfs_mount(mntpt);
+ ctx->dfs_automount = ctx->dfs_conn = is_dfs_mount(mntpt);
cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dfs_automount=%d\n",
__func__, ctx->source, ctx->UNC, ctx->prepath, ctx->dfs_automount);
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index 48c27581ec51..3b48a093cfb1 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -108,8 +108,8 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
buf->InodeType = cpu_to_le64(type);
buf->ReparseDataLength = cpu_to_le16(len + dlen -
sizeof(struct reparse_data_buffer));
- *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
- MINOR(dev));
+ *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MINOR(dev) << 32) |
+ MAJOR(dev));
iov->iov_base = buf;
iov->iov_len = len + dlen;
return 0;
@@ -468,7 +468,7 @@ static void wsl_to_fattr(struct cifs_open_info_data *data,
else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
- fattr->cf_rdev = wsl_mkdev(v);
+ fattr->cf_rdev = reparse_mkdev(v);
} while (next);
out:
fattr->cf_dtype = S_DT(fattr->cf_mode);
@@ -485,11 +485,11 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
switch (le64_to_cpu(buf->InodeType)) {
case NFS_SPECFILE_CHR:
fattr->cf_mode |= S_IFCHR;
- fattr->cf_rdev = reparse_nfs_mkdev(buf);
+ fattr->cf_rdev = reparse_mkdev(buf->DataBuffer);
break;
case NFS_SPECFILE_BLK:
fattr->cf_mode |= S_IFBLK;
- fattr->cf_rdev = reparse_nfs_mkdev(buf);
+ fattr->cf_rdev = reparse_mkdev(buf->DataBuffer);
break;
case NFS_SPECFILE_FIFO:
fattr->cf_mode |= S_IFIFO;
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index 2c0644bc4e65..158e7b7aae64 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -18,14 +18,7 @@
*/
#define IO_REPARSE_TAG_INTERNAL ((__u32)~0U)
-static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
-{
- u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
-
- return MKDEV(v >> 32, v & 0xffffffff);
-}
-
-static inline dev_t wsl_mkdev(void *ptr)
+static inline dev_t reparse_mkdev(void *ptr)
{
u64 v = le64_to_cpu(*(__le64 *)ptr);
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 3216f786908f..03c0b484a4b5 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -624,7 +624,7 @@ cifs_ses_add_channel(struct cifs_ses *ses,
* to sign packets before we generate the channel signing key
* (we sign with the session key)
*/
- rc = smb311_crypto_shash_allocate(chan->server);
+ rc = smb3_crypto_shash_allocate(chan->server);
if (rc) {
cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
mutex_unlock(&ses->session_mutex);
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index f3c4b70b77b9..bdeb12ff53e3 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -906,41 +906,41 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
|| (hdr->Status !=
cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))))
return 0;
-
ok:
- rc = smb311_crypto_shash_allocate(server);
- if (rc)
+ rc = cifs_alloc_hash("sha512", &sha512);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not allocate SHA512 shash, rc=%d\n", __func__, rc);
return rc;
+ }
- sha512 = server->secmech.sha512;
rc = crypto_shash_init(sha512);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not init SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
rc = crypto_shash_update(sha512, ses->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
for (i = 0; i < nvec; i++) {
rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n",
- __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
}
rc = crypto_shash_final(sha512, ses->preauth_sha_hash);
if (rc) {
- cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n",
- __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not finalize SHA12 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
+err_free:
+ cifs_free_hash(&sha512);
return 0;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 7381ec333c6d..177173072bfa 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -4309,7 +4309,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
*/
static int
crypt_message(struct TCP_Server_Info *server, int num_rqst,
- struct smb_rqst *rqst, int enc)
+ struct smb_rqst *rqst, int enc, struct crypto_aead *tfm)
{
struct smb2_transform_hdr *tr_hdr =
(struct smb2_transform_hdr *)rqst[0].rq_iov[0].iov_base;
@@ -4320,8 +4320,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
u8 *iv;
- DECLARE_CRYPTO_WAIT(wait);
- struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
size_t sensitive_size;
@@ -4333,14 +4331,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- rc = smb3_crypto_aead_allocate(server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__);
- return rc;
- }
-
- tfm = enc ? server->secmech.enc : server->secmech.dec;
-
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
@@ -4380,11 +4370,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
- aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &wait);
-
- rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
- : crypto_aead_decrypt(req), &wait);
+ rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
@@ -4526,7 +4512,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
/* fill the 1st iov with a transform header */
fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type);
- rc = crypt_message(server, num_rqst, new_rq, 1);
+ rc = crypt_message(server, num_rqst, new_rq, 1, server->secmech.enc);
cifs_dbg(FYI, "Encrypt message returned %d\n", rc);
if (rc)
goto err_free;
@@ -4551,8 +4537,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
unsigned int buf_data_size, struct iov_iter *iter,
bool is_offloaded)
{
- struct kvec iov[2];
+ struct crypto_aead *tfm;
struct smb_rqst rqst = {NULL};
+ struct kvec iov[2];
size_t iter_size = 0;
int rc;
@@ -4568,9 +4555,31 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
iter_size = iov_iter_count(iter);
}
- rc = crypt_message(server, 1, &rqst, 0);
+ if (is_offloaded) {
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
+ tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ else
+ tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+ if (IS_ERR(tfm)) {
+ rc = PTR_ERR(tfm);
+ cifs_server_dbg(VFS, "%s: Failed alloc decrypt TFM, rc=%d\n", __func__, rc);
+
+ return rc;
+ }
+ } else {
+ if (unlikely(!server->secmech.dec))
+ return -EIO;
+
+ tfm = server->secmech.dec;
+ }
+
+ rc = crypt_message(server, 1, &rqst, 0, tfm);
cifs_dbg(FYI, "Decrypt message returned %d\n", rc);
+ if (is_offloaded)
+ crypto_free_aead(tfm);
+
if (rc)
return rc;
@@ -4869,9 +4878,12 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
goto discard_data;
server->total_read += rc;
- if (rc < len)
- iov_iter_zero(len - rc, &iter);
- iov_iter_revert(&iter, len);
+ if (rc < len) {
+ struct iov_iter tmp = iter;
+
+ iov_iter_advance(&tmp, rc);
+ iov_iter_zero(len - rc, &tmp);
+ }
iov_iter_truncate(&iter, dw->len);
rc = cifs_discard_remaining_data(server);
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2cb1bf65a172..02828b9c3cb3 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1266,6 +1266,16 @@ SMB2_negotiate(const unsigned int xid,
else
cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
}
+
+ if (server->cipher_type && !rc) {
+ if (!SERVER_IS_CHAN(server)) {
+ rc = smb3_crypto_aead_allocate(server);
+ } else {
+ /* For channels, just reuse the primary server crypto secmech. */
+ server->secmech.enc = server->primary_server->secmech.enc;
+ server->secmech.dec = server->primary_server->secmech.dec;
+ }
+ }
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -4866,7 +4876,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
#endif
if (result) {
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
- trace_smb3_write_err(wdata->xid,
+ trace_smb3_write_err(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ wdata->xid,
wdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid, wdata->subreq.start,
wdata->subreq.len, wdata->result);
@@ -4874,7 +4886,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
pr_warn_once("Out of space writing to %s\n",
tcon->tree_name);
} else
- trace_smb3_write_done(0 /* no xid */,
+ trace_smb3_write_done(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ wdata->xid,
wdata->req->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid,
wdata->subreq.start, wdata->subreq.len);
@@ -4952,7 +4966,9 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
- trace_smb3_write_enter(wdata->xid,
+ trace_smb3_write_enter(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ wdata->xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
@@ -5032,7 +5048,9 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
wdata, flags, &wdata->credits);
/* Can't touch wdata if rc == 0 */
if (rc) {
- trace_smb3_write_err(xid,
+ trace_smb3_write_err(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ xid,
io_parms->persistent_fid,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
@@ -5112,7 +5130,7 @@ replay_again:
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
- trace_smb3_write_enter(xid, io_parms->persistent_fid,
+ trace_smb3_write_enter(0, 0, xid, io_parms->persistent_fid,
io_parms->tcon->tid, io_parms->tcon->ses->Suid,
io_parms->offset, io_parms->length);
@@ -5133,7 +5151,7 @@ replay_again:
rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
if (rc) {
- trace_smb3_write_err(xid,
+ trace_smb3_write_err(0, 0, xid,
req->PersistentFileId,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
@@ -5142,7 +5160,7 @@ replay_again:
cifs_dbg(VFS, "Send error in write = %d\n", rc);
} else {
*nbytes = le32_to_cpu(rsp->DataLength);
- trace_smb3_write_done(xid,
+ trace_smb3_write_done(0, 0, xid,
req->PersistentFileId,
io_parms->tcon->tid,
io_parms->tcon->ses->Suid,
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index c7e1b149877a..56a896ff7cd9 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -291,7 +291,7 @@ extern int smb2_validate_and_copy_iov(unsigned int offset,
extern void smb2_copy_fs_info_to_kstatfs(
struct smb2_fs_full_size_info *pfs_inf,
struct kstatfs *kst);
-extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
+extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server);
extern int smb311_update_preauth_hash(struct cifs_ses *ses,
struct TCP_Server_Info *server,
struct kvec *iov, int nvec);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index e4636fca821d..f7e04c40d22e 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -26,8 +26,7 @@
#include "../common/smb2status.h"
#include "smb2glob.h"
-static int
-smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+int smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
int rc;
@@ -46,33 +45,6 @@ err:
return rc;
}
-int
-smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- struct cifs_secmech *p = &server->secmech;
- int rc = 0;
-
- rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
- if (rc)
- return rc;
-
- rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
- if (rc)
- goto err;
-
- rc = cifs_alloc_hash("sha512", &p->sha512);
- if (rc)
- goto err;
-
- return 0;
-
-err:
- cifs_free_hash(&p->aes_cmac);
- cifs_free_hash(&p->hmacsha256);
- return rc;
-}
-
-
static
int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
{
@@ -242,7 +214,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
if (unlikely(!ses)) {
- cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
+ cifs_server_dbg(FYI, "%s: Could not find session\n", __func__);
return -ENOENT;
}
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 8e9964001e2a..0b52d22a91a0 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -157,6 +157,7 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \
TP_ARGS(rreq_debug_id, rreq_debug_index, xid, fid, tid, sesid, offset, len, rc))
DEFINE_SMB3_RW_ERR_EVENT(read_err);
+DEFINE_SMB3_RW_ERR_EVENT(write_err);
/* For logging errors in other file I/O ops */
DECLARE_EVENT_CLASS(smb3_other_err_class,
@@ -202,7 +203,6 @@ DEFINE_EVENT(smb3_other_err_class, smb3_##name, \
int rc), \
TP_ARGS(xid, fid, tid, sesid, offset, len, rc))
-DEFINE_SMB3_OTHER_ERR_EVENT(write_err);
DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err);
DEFINE_SMB3_OTHER_ERR_EVENT(zero_err);
DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err);
@@ -370,6 +370,8 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \
DEFINE_SMB3_RW_DONE_EVENT(read_enter);
DEFINE_SMB3_RW_DONE_EVENT(read_done);
+DEFINE_SMB3_RW_DONE_EVENT(write_enter);
+DEFINE_SMB3_RW_DONE_EVENT(write_done);
/* For logging successful other op */
DECLARE_EVENT_CLASS(smb3_other_done_class,
@@ -411,11 +413,9 @@ DEFINE_EVENT(smb3_other_done_class, smb3_##name, \
__u32 len), \
TP_ARGS(xid, fid, tid, sesid, offset, len))
-DEFINE_SMB3_OTHER_DONE_EVENT(write_enter);
DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_enter);
DEFINE_SMB3_OTHER_DONE_EVENT(zero_enter);
DEFINE_SMB3_OTHER_DONE_EVENT(falloc_enter);
-DEFINE_SMB3_OTHER_DONE_EVENT(write_done);
DEFINE_SMB3_OTHER_DONE_EVENT(query_dir_done);
DEFINE_SMB3_OTHER_DONE_EVENT(zero_done);
DEFINE_SMB3_OTHER_DONE_EVENT(falloc_done);
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index fd5a85d43759..91812150186c 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -1817,11 +1817,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
length = data_len; /* An RDMA read is already done. */
else
#endif
- {
length = cifs_read_iter_from_socket(server, &rdata->subreq.io_iter,
data_len);
- iov_iter_revert(&rdata->subreq.io_iter, data_len);
- }
if (length > 0)
rdata->got_bytes += length;
server->total_read += length;
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c769f9dbc0b4..9f272cc8f566 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -6,7 +6,7 @@
* Note that, due to trying to use names similar to the protocol specifications,
* there are many mixed case field names in the structures below. Although
* this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
+ * able to match against the protocol specification.
*
* SMB2 commands
* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
@@ -491,7 +491,7 @@ struct smb2_encryption_neg_context {
__le16 ContextType; /* 2 */
__le16 DataLength;
__le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
+ /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
__le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
__le16 Ciphers[];
} __packed;
@@ -1061,7 +1061,7 @@ struct smb2_server_client_notification {
#define IL_IMPERSONATION cpu_to_le32(0x00000002)
#define IL_DELEGATE cpu_to_le32(0x00000003)
-/* File Attrubutes */
+/* File Attributes */
#define FILE_ATTRIBUTE_READONLY 0x00000001
#define FILE_ATTRIBUTE_HIDDEN 0x00000002
#define FILE_ATTRIBUTE_SYSTEM 0x00000004
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index cac80e7bfefc..aa2a37a7ce84 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -25,7 +25,7 @@ DECLARE_RWSEM(conn_list_lock);
/**
* ksmbd_conn_free() - free resources of the connection instance
*
- * @conn: connection instance to be cleand up
+ * @conn: connection instance to be cleaned up
*
* During the thread termination, the corresponding conn instance
* resources(sock/memory) are released and finally the conn object is freed.
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index f4e55199938d..38e6fd2da3b8 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -213,7 +213,7 @@ struct ksmbd_tree_connect_response {
};
/*
- * IPC Request struture to disconnect tree connection.
+ * IPC Request structure to disconnect tree connection.
*/
struct ksmbd_tree_disconnect_request {
__u64 session_id; /* session id */
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 246cde380dfb..4142c7ad5fa9 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -796,7 +796,7 @@ out:
/**
* smb2_lease_break_noti() - break lease when a new client request
* write lease
- * @opinfo: conains lease state information
+ * @opinfo: contains lease state information
*
* Return: 0 on success, otherwise error
*/
@@ -1484,7 +1484,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
}
/**
- * parse_lease_state() - parse lease context containted in file open request
+ * parse_lease_state() - parse lease context contained in file open request
* @open_req: buffer containing smb2 file open(create) request
*
* Return: allocated lease context object on success, otherwise NULL
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index c402d4abe826..231d2d224656 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -279,7 +279,7 @@ static void handle_ksmbd_work(struct work_struct *wk)
/**
* queue_ksmbd_work() - queue a smb request to worker thread queue
- * for proccessing smb command and sending response
+ * for processing smb command and sending response
* @conn: connection instance
*
* read remaining data from socket create and submit work.
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index e6bdc1b20727..7460089c186f 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1335,8 +1335,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
return rc;
sz = le16_to_cpu(rsp->SecurityBufferOffset);
- chgblob =
- (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz);
+ chgblob = (struct challenge_message *)rsp->Buffer;
memset(chgblob, 0, sizeof(struct challenge_message));
if (!work->conn->use_spnego) {
@@ -1369,9 +1368,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
goto out;
}
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len,
- /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
out:
@@ -1453,10 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work,
if (rc)
return -ENOMEM;
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob,
- spnego_blob_len,
- /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
}
@@ -2058,18 +2052,20 @@ out_err1:
* @access: file access flags
* @disposition: file disposition flags
* @may_flags: set with MAY_ flags
- * @is_dir: is creating open flags for directory
+ * @coptions: file creation options
+ * @mode: file mode
*
* Return: file open flags
*/
static int smb2_create_open_flags(bool file_present, __le32 access,
__le32 disposition,
int *may_flags,
- bool is_dir)
+ __le32 coptions,
+ umode_t mode)
{
int oflags = O_NONBLOCK | O_LARGEFILE;
- if (is_dir) {
+ if (coptions & FILE_DIRECTORY_FILE_LE || S_ISDIR(mode)) {
access &= ~FILE_WRITE_DESIRE_ACCESS_LE;
ksmbd_debug(SMB, "Discard write access to a directory\n");
}
@@ -2086,7 +2082,7 @@ static int smb2_create_open_flags(bool file_present, __le32 access,
*may_flags = MAY_OPEN | MAY_READ;
}
- if (access == FILE_READ_ATTRIBUTES_LE)
+ if (access == FILE_READ_ATTRIBUTES_LE || S_ISBLK(mode) || S_ISCHR(mode))
oflags |= O_PATH;
if (file_present) {
@@ -3181,8 +3177,8 @@ int smb2_open(struct ksmbd_work *work)
open_flags = smb2_create_open_flags(file_present, daccess,
req->CreateDisposition,
&may_flags,
- req->CreateOptions & FILE_DIRECTORY_FILE_LE ||
- (file_present && S_ISDIR(d_inode(path.dentry)->i_mode)));
+ req->CreateOptions,
+ file_present ? d_inode(path.dentry)->i_mode : 0);
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
if (open_flags & (O_CREAT | O_TRUNC)) {
@@ -3531,8 +3527,9 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->create_guid, dh_info.CreateGuid,
SMB2_CREATE_GUID_SIZE);
if (dh_info.timeout)
- fp->durable_timeout = min(dh_info.timeout,
- DURABLE_HANDLE_MAX_TIMEOUT);
+ fp->durable_timeout =
+ min_t(unsigned int, dh_info.timeout,
+ DURABLE_HANDLE_MAX_TIMEOUT);
else
fp->durable_timeout = 60;
}
@@ -4586,7 +4583,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
path = &fp->filp->f_path;
/* single EA entry is requested with given user.* name */
if (req->InputBufferLength) {
- if (le32_to_cpu(req->InputBufferLength) <
+ if (le32_to_cpu(req->InputBufferLength) <=
sizeof(struct smb2_ea_info_req))
return -EINVAL;
@@ -8090,7 +8087,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- if (in_buf_len < sizeof(struct copychunk_ioctl_req)) {
+ if (in_buf_len <= sizeof(struct copychunk_ioctl_req)) {
ret = -EINVAL;
goto out;
}
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 3be7d5ae65a8..73aff20e22d0 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -194,7 +194,7 @@ struct copychunk_ioctl_req {
__le64 ResumeKey[3];
__le32 ChunkCount;
__le32 Reserved;
- __u8 Chunks[1]; /* array of srv_copychunk */
+ __u8 Chunks[]; /* array of srv_copychunk */
} __packed;
struct srv_copychunk {
@@ -370,7 +370,7 @@ struct smb2_file_attr_tag_info {
struct smb2_ea_info_req {
__le32 NextEntryOffset;
__u8 EaNameLength;
- char name[1];
+ char name[];
} __packed; /* level 15 Query */
struct smb2_ea_info {
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index cc4bb2377cbd..5b8d75e78ffb 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -488,7 +488,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
* @shortname: destination short filename
*
* Return: shortname length or 0 when source long name is '.' or '..'
- * TODO: Though this function comforms the restriction of 8.3 Filename spec,
+ * TODO: Though this function conforms the restriction of 8.3 Filename spec,
* but the result is different with Windows 7's one. need to check.
*/
int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index b0f6d0f94cb8..5bbb179736c2 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -100,8 +100,8 @@ struct ksmbd_file {
struct list_head blocked_works;
struct list_head lock_list;
- int durable_timeout;
- int durable_scavenger_timeout;
+ unsigned int durable_timeout;
+ unsigned int durable_scavenger_timeout;
/* if ls is happening on directory, below is valid*/
struct ksmbd_readdir_data readdir_data;
diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h
index fa3e27d6971b..505101a8104c 100644
--- a/fs/smb/server/xattr.h
+++ b/fs/smb/server/xattr.h
@@ -99,7 +99,7 @@ struct xattr_ntacl {
__u8 posix_acl_hash[XATTR_SD_HASH_SIZE]; /* 64bytes hash for posix acl */
};
-/* DOS ATTRIBUITE XATTR PREFIX */
+/* DOS ATTRIBUTE XATTR PREFIX */
#define DOS_ATTRIBUTE_PREFIX "DOSATTRIB"
#define DOS_ATTRIBUTE_PREFIX_LEN (sizeof(DOS_ATTRIBUTE_PREFIX) - 1)
#define XATTR_NAME_DOS_ATTRIBUTE (XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX)
diff --git a/fs/splice.c b/fs/splice.c
index 60aed8de21f8..06232d7e505f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1566,11 +1566,11 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
static int vmsplice_type(struct fd f, int *type)
{
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (f.file->f_mode & FMODE_WRITE) {
+ if (fd_file(f)->f_mode & FMODE_WRITE) {
*type = ITER_SOURCE;
- } else if (f.file->f_mode & FMODE_READ) {
+ } else if (fd_file(f)->f_mode & FMODE_READ) {
*type = ITER_DEST;
} else {
fdput(f);
@@ -1621,9 +1621,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
if (!iov_iter_count(&iter))
error = 0;
else if (type == ITER_SOURCE)
- error = vmsplice_to_pipe(f.file, &iter, flags);
+ error = vmsplice_to_pipe(fd_file(f), &iter, flags);
else
- error = vmsplice_to_user(f.file, &iter, flags);
+ error = vmsplice_to_user(fd_file(f), &iter, flags);
kfree(iov);
out_fdput:
@@ -1646,10 +1646,10 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
error = -EBADF;
in = fdget(fd_in);
- if (in.file) {
+ if (fd_file(in)) {
out = fdget(fd_out);
- if (out.file) {
- error = __do_splice(in.file, off_in, out.file, off_out,
+ if (fd_file(out)) {
+ error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
len, flags);
fdput(out);
}
@@ -2016,10 +2016,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
error = -EBADF;
in = fdget(fdin);
- if (in.file) {
+ if (fd_file(in)) {
out = fdget(fdout);
- if (out.file) {
- error = do_tee(in.file, out.file, len, flags);
+ if (fd_file(out)) {
+ error = do_tee(fd_file(in), fd_file(out), len, flags);
fdput(out);
}
fdput(in);
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index 8a218e7c2390..e4d7e507b268 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -46,7 +46,7 @@ static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
}
kfree(comp_opts);
- return (__force void *) percpu;
+ return (void *)(__force unsigned long) percpu;
out:
for_each_possible_cpu(cpu) {
@@ -61,7 +61,7 @@ out:
static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream __percpu *percpu =
- (struct squashfs_stream __percpu *) msblk->stream;
+ (void __percpu *)(unsigned long) msblk->stream;
struct squashfs_stream *stream;
int cpu;
@@ -79,7 +79,7 @@ static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
{
struct squashfs_stream *stream;
struct squashfs_stream __percpu *percpu =
- (struct squashfs_stream __percpu *) msblk->stream;
+ (void __percpu *)(unsigned long) msblk->stream;
int res;
local_lock(&percpu->lock);
diff --git a/fs/stat.c b/fs/stat.c
index 89ce1be56310..41e598376d7e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -224,9 +224,9 @@ int vfs_fstat(int fd, struct kstat *stat)
int error;
f = fdget_raw(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0);
+ error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
fdput(f);
return error;
}
@@ -277,9 +277,9 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
u32 request_mask)
{
CLASS(fd_raw, f)(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
+ return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask);
}
/**
diff --git a/fs/statfs.c b/fs/statfs.c
index 96d1c3edf289..9c7bb27e7932 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -116,8 +116,8 @@ int fd_statfs(int fd, struct kstatfs *st)
{
struct fd f = fdget_raw(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_statfs(&f.file->f_path, st);
+ if (fd_file(f)) {
+ error = vfs_statfs(&fd_file(f)->f_path, st);
fdput(f);
}
return error;
diff --git a/fs/sync.c b/fs/sync.c
index dc725914e1ed..67df255eb189 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -152,15 +152,15 @@ SYSCALL_DEFINE1(syncfs, int, fd)
struct super_block *sb;
int ret, ret2;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- sb = f.file->f_path.dentry->d_sb;
+ sb = fd_file(f)->f_path.dentry->d_sb;
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
- ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
+ ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err);
fdput(f);
return ret ? ret : ret2;
@@ -208,8 +208,8 @@ static int do_fsync(unsigned int fd, int datasync)
struct fd f = fdget(fd);
int ret = -EBADF;
- if (f.file) {
- ret = vfs_fsync(f.file, datasync);
+ if (fd_file(f)) {
+ ret = vfs_fsync(fd_file(f), datasync);
fdput(f);
}
return ret;
@@ -360,8 +360,8 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
ret = -EBADF;
f = fdget(fd);
- if (f.file)
- ret = sync_file_range(f.file, offset, nbytes, flags);
+ if (fd_file(f))
+ ret = sync_file_range(fd_file(f), offset, nbytes, flags);
fdput(f);
return ret;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 4bf2f8bfec11..137523e0bb21 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -397,9 +397,9 @@ static const struct file_operations timerfd_fops = {
static int timerfd_fget(int fd, struct fd *p)
{
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (f.file->f_op != &timerfd_fops) {
+ if (fd_file(f)->f_op != &timerfd_fops) {
fdput(f);
return -EINVAL;
}
@@ -482,7 +482,7 @@ static int do_timerfd_settime(int ufd, int flags,
ret = timerfd_fget(ufd, &f);
if (ret)
return ret;
- ctx = f.file->private_data;
+ ctx = fd_file(f)->private_data;
if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
fdput(f);
@@ -546,7 +546,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
int ret = timerfd_fget(ufd, &f);
if (ret)
return ret;
- ctx = f.file->private_data;
+ ctx = fd_file(f)->private_data;
spin_lock_irq(&ctx->wqh.lock);
if (ctx->expired && ctx->tintv) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d91cec93d968..5cc69beaa62e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2807,7 +2807,6 @@ static const struct file_operations dfs_fops = {
.read = dfs_file_read,
.write = dfs_file_write,
.owner = THIS_MODULE,
- .llseek = no_llseek,
};
/**
@@ -2952,7 +2951,6 @@ static const struct file_operations dfs_global_fops = {
.read = dfs_global_file_read,
.write = dfs_global_file_write,
.owner = THIS_MODULE,
- .llseek = no_llseek,
};
/**
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 27a3e9285fbf..68cdd89c97a3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -104,21 +104,6 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}
-static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
- vm_flags_t flags)
-{
- const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
-
- vm_flags_reset(vma, flags);
- /*
- * For shared mappings, we want to enable writenotify while
- * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
- * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
- */
- if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
- vma_set_page_prot(vma);
-}
-
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -386,15 +371,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
unsigned int blocking_state;
/*
- * We don't do userfault handling for the final child pid update.
- *
- * We also don't do userfault handling during
- * coredumping. hugetlbfs has the special
- * hugetlb_follow_page_mask() to skip missing pages in the
- * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
- * the no_page_table() helper in follow_page_mask(), but the
- * shmem_vm_ops->fault method is invoked even during
- * coredumping and it ends up here.
+ * We don't do userfault handling for the final child pid update
+ * and when coredumping (faults triggered by get_dump_page()).
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
@@ -615,22 +593,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) {
- struct vm_area_struct *vma;
- struct mm_struct *mm = release_new_ctx->mm;
- VMA_ITERATOR(vmi, mm, 0);
-
- /* the various vma->vm_userfaultfd_ctx still points to it */
- mmap_write_lock(mm);
- for_each_vma(vmi, vma) {
- if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma,
- vma->vm_flags & ~__VM_UFFD_FLAGS);
- }
- }
- mmap_write_unlock(mm);
-
+ userfaultfd_release_new(release_new_ctx);
userfaultfd_ctx_put(release_new_ctx);
}
@@ -662,9 +625,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
return 0;
if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
return 0;
}
@@ -749,9 +710,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
- vma_start_write(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ userfaultfd_reset_ctx(vma);
}
}
@@ -870,54 +829,14 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
- unsigned long new_flags;
- VMA_ITERATOR(vmi, mm, 0);
WRITE_ONCE(ctx->released, true);
- if (!mmget_not_zero(mm))
- goto wakeup;
+ userfaultfd_release_all(mm, ctx);
/*
- * Flush page faults out of all CPUs. NOTE: all page faults
- * must be retried without returning VM_FAULT_SIGBUS if
- * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
- * changes while handle_userfault released the mmap_lock. So
- * it's critical that released is set to true (above), before
- * taking the mmap_lock for writing.
- */
- mmap_write_lock(mm);
- prev = NULL;
- for_each_vma(vmi, vma) {
- cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
- !!(vma->vm_flags & __VM_UFFD_FLAGS));
- if (vma->vm_userfaultfd_ctx.ctx != ctx) {
- prev = vma;
- continue;
- }
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, vma->vm_start,
- vma->vm_end - vma->vm_start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
- vma->vm_end, new_flags,
- NULL_VM_UFFD_CTX);
-
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
- prev = vma;
- }
- mmap_write_unlock(mm);
- mmput(mm);
-wakeup:
- /*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
* the fault_*wqh.
@@ -1293,14 +1212,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
- struct vm_area_struct *vma, *prev, *cur;
+ struct vm_area_struct *vma, *cur;
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags, new_flags;
+ unsigned long vm_flags;
bool found;
bool basic_ioctls;
- unsigned long start, end, vma_end;
+ unsigned long start, end;
struct vma_iterator vmi;
bool wp_async = userfaultfd_wp_async_ctx(ctx);
@@ -1428,57 +1347,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
} for_each_vma_range(vmi, cur, end);
BUG_ON(!found);
- vma_iter_set(&vmi, start);
- prev = vma_prev(&vmi);
- if (vma->vm_start < start)
- prev = vma;
-
- ret = 0;
- for_each_vma_range(vmi, vma, end) {
- cond_resched();
-
- BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
- if (vma->vm_userfaultfd_ctx.ctx == ctx &&
- (vma->vm_flags & vm_flags) == vm_flags)
- goto skip;
-
- if (vma->vm_start > start)
- start = vma->vm_start;
- vma_end = min(end, vma->vm_end);
-
- new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags,
- (struct vm_userfaultfd_ctx){ctx});
- if (IS_ERR(vma)) {
- ret = PTR_ERR(vma);
- break;
- }
-
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx.ctx = ctx;
-
- if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
- hugetlb_unshare_all_pmds(vma);
-
- skip:
- prev = vma;
- start = vma->vm_end;
- }
+ ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
+ wp_async);
out_unlock:
mmap_write_unlock(mm);
@@ -1519,7 +1389,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_range uffdio_unregister;
- unsigned long new_flags;
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
@@ -1622,27 +1491,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, vma_end - start, false);
-
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
- new_flags, NULL_VM_UFFD_CTX);
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ start, vma_end);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
break;
}
- /*
- * In the vma_merge() successful mprotect-like case 8:
- * the next vma was merged into the current one and
- * the current one has not been updated yet.
- */
- vma_start_write(vma);
- userfaultfd_set_vm_flags(vma, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-
skip:
prev = vma;
start = vma->vm_end;
diff --git a/fs/utimes.c b/fs/utimes.c
index 3701b3946f88..99b26f792b89 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -115,9 +115,9 @@ static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
return -EINVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = vfs_utimes(&f.file->f_path, times);
+ error = vfs_utimes(&fd_file(f)->f_path, times);
fdput(f);
return error;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 7672ce5486c5..05ec7e7d9e87 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -697,19 +697,19 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
int error;
CLASS(fd, f)(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- audit_file(f.file);
+ audit_file(fd_file(f));
error = setxattr_copy(name, &ctx);
if (error)
return error;
- error = mnt_want_write_file(f.file);
+ error = mnt_want_write_file(fd_file(f));
if (!error) {
- error = do_setxattr(file_mnt_idmap(f.file),
- f.file->f_path.dentry, &ctx);
- mnt_drop_write_file(f.file);
+ error = do_setxattr(file_mnt_idmap(fd_file(f)),
+ fd_file(f)->f_path.dentry, &ctx);
+ mnt_drop_write_file(fd_file(f));
}
kvfree(ctx.kvalue);
return error;
@@ -812,10 +812,10 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
struct fd f = fdget(fd);
ssize_t error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return error;
- audit_file(f.file);
- error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
+ audit_file(fd_file(f));
+ error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry,
name, value, size);
fdput(f);
return error;
@@ -888,10 +888,10 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
struct fd f = fdget(fd);
ssize_t error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return error;
- audit_file(f.file);
- error = listxattr(f.file->f_path.dentry, list, size);
+ audit_file(fd_file(f));
+ error = listxattr(fd_file(f)->f_path.dentry, list, size);
fdput(f);
return error;
}
@@ -954,9 +954,9 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
char kname[XATTR_NAME_MAX + 1];
int error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return error;
- audit_file(f.file);
+ audit_file(fd_file(f));
error = strncpy_from_user(kname, name, sizeof(kname));
if (error == 0 || error == sizeof(kname))
@@ -964,11 +964,11 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
if (error < 0)
return error;
- error = mnt_want_write_file(f.file);
+ error = mnt_want_write_file(fd_file(f));
if (!error) {
- error = removexattr(file_mnt_idmap(f.file),
- f.file->f_path.dentry, kname);
- mnt_drop_write_file(f.file);
+ error = removexattr(file_mnt_idmap(fd_file(f)),
+ fd_file(f)->f_path.dentry, kname);
+ mnt_drop_write_file(fd_file(f));
}
fdput(f);
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6aaec1246c95..e50d913ad32f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -1138,10 +1138,7 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
- if (!tmpbuffer)
- return -ENOMEM;
-
+ tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1205,7 +1202,7 @@ xfs_attr3_leaf_to_shortform(
error = 0;
out:
- kfree(tmpbuffer);
+ kvfree(tmpbuffer);
return error;
}
@@ -1613,7 +1610,7 @@ xfs_attr3_leaf_compact(
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
+ tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1651,7 +1648,7 @@ xfs_attr3_leaf_compact(
*/
xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
- kfree(tmpbuffer);
+ kvfree(tmpbuffer);
}
/*
@@ -2330,7 +2327,7 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kzalloc(state->args->geo->blksize,
+ tmp_leaf = kvzalloc(state->args->geo->blksize,
GFP_KERNEL | __GFP_NOFAIL);
/*
@@ -2371,7 +2368,7 @@ xfs_attr3_leaf_unbalance(
}
memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
- kfree(tmp_leaf);
+ kvfree(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 20bb5ce38134..271855227514 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -3034,6 +3034,11 @@ xfs_ialloc_setup_geometry(
igeo->ialloc_align = mp->m_dalign;
else
igeo->ialloc_align = 0;
+
+ if (mp->m_sb.sb_blocksize > PAGE_SIZE)
+ igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
+ else
+ igeo->min_folio_order = 0;
}
/* Compute the location of the root directory inode that is laid out by mkfs. */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 2f7413afbf46..33b84a3a83ff 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -224,6 +224,9 @@ struct xfs_ino_geometry {
/* precomputed value for di_flags2 */
uint64_t new_diflags2;
+ /* minimum folio order of a page cache allocation */
+ unsigned int min_folio_order;
+
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 9b5d98fe1f8a..c753c79df203 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -126,7 +126,7 @@ xfile_load(
unsigned int len;
unsigned int offset;
- if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
SGP_READ) < 0)
break;
if (!folio) {
@@ -196,7 +196,7 @@ xfile_store(
unsigned int len;
unsigned int offset;
- if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
SGP_CACHE) < 0)
break;
if (filemap_check_wb_err(inode->i_mapping, 0)) {
@@ -267,7 +267,7 @@ xfile_get_folio(
i_size_write(inode, pos + len);
pflags = memalloc_nofs_save();
- error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
memalloc_nofs_restore(pflags);
if (error)
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 9bb2d24de709..07bebbfb16ee 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -149,7 +149,7 @@ xmbuf_map_page(
return -ENOMEM;
}
- error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+ error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
if (error)
return error;
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index d0889190ab7f..75cb53f090d1 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -829,9 +829,9 @@ xfs_ioc_exchange_range(
fxr.flags = args.flags;
file1 = fdget(args.file1_fd);
- if (!file1.file)
+ if (!fd_file(file1))
return -EBADF;
- fxr.file1 = file1.file;
+ fxr.file1 = fd_file(file1);
error = xfs_exchange_range(&fxr);
fdput(file1);
@@ -935,9 +935,9 @@ xfs_ioc_commit_range(
fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
file1 = fdget(args.file1_fd);
- if (!file1.file)
+ if (fd_empty(file1))
return -EBADF;
- fxr.file1 = file1.file;
+ fxr.file1 = fd_file(file1);
error = xfs_exchange_range(&fxr);
fdput(file1);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e97d789495a5..412b1d71b52b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -760,7 +760,7 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index cf5acbd3c7ca..49e5e5f04e60 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -85,16 +85,16 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = {NULL};
+ struct fd f = EMPTY_FD;
struct path path;
int error;
struct xfs_inode *ip;
if (cmd == XFS_IOC_FD_TO_HANDLE) {
f = fdget(hreq->fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- inode = file_inode(f.file);
+ inode = file_inode(fd_file(f));
} else {
error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 20d9924f28c2..a680e5b82672 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -100,7 +100,8 @@ xfs_inode_alloc(
/* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- mapping_set_large_folios(VFS_I(ip)->i_mapping);
+ mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
+ M_IGEO(mp)->min_folio_order);
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -360,7 +361,8 @@ xfs_reinit_inode(
inode->i_uid = uid;
inode->i_gid = gid;
inode->i_state = state;
- mapping_set_large_folios(inode->i_mapping);
+ mapping_set_folio_min_order(inode->i_mapping,
+ M_IGEO(mp)->min_folio_order);
return error;
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7226d27e8afc..a20d426ef021 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -886,33 +886,33 @@ xfs_ioc_swapext(
/* Pull information for the target fd */
f = fdget((int)sxp->sx_fdtarget);
- if (!f.file) {
+ if (!fd_file(f)) {
error = -EINVAL;
goto out;
}
- if (!(f.file->f_mode & FMODE_WRITE) ||
- !(f.file->f_mode & FMODE_READ) ||
- (f.file->f_flags & O_APPEND)) {
+ if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
+ !(fd_file(f)->f_mode & FMODE_READ) ||
+ (fd_file(f)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_file;
}
tmp = fdget((int)sxp->sx_fdtmp);
- if (!tmp.file) {
+ if (!fd_file(tmp)) {
error = -EINVAL;
goto out_put_file;
}
- if (!(tmp.file->f_mode & FMODE_WRITE) ||
- !(tmp.file->f_mode & FMODE_READ) ||
- (tmp.file->f_flags & O_APPEND)) {
+ if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
+ !(fd_file(tmp)->f_mode & FMODE_READ) ||
+ (fd_file(tmp)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_tmp_file;
}
- if (IS_SWAPFILE(file_inode(f.file)) ||
- IS_SWAPFILE(file_inode(tmp.file))) {
+ if (IS_SWAPFILE(file_inode(fd_file(f))) ||
+ IS_SWAPFILE(file_inode(fd_file(tmp)))) {
error = -EINVAL;
goto out_put_tmp_file;
}
@@ -922,14 +922,14 @@ xfs_ioc_swapext(
* before we cast and access them as XFS structures as we have no
* control over what the user passes us here.
*/
- if (f.file->f_op != &xfs_file_operations ||
- tmp.file->f_op != &xfs_file_operations) {
+ if (fd_file(f)->f_op != &xfs_file_operations ||
+ fd_file(tmp)->f_op != &xfs_file_operations) {
error = -EINVAL;
goto out_put_tmp_file;
}
- ip = XFS_I(file_inode(f.file));
- tip = XFS_I(file_inode(tmp.file));
+ ip = XFS_I(file_inode(fd_file(f)));
+ tip = XFS_I(file_inode(fd_file(tmp)));
if (ip->i_mount != tip->i_mount) {
error = -EINVAL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 72c981e3dc92..1e11f48814c0 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1208,14 +1208,14 @@ out_unlock:
return error;
}
-static int
+static void
xfs_buffered_write_delalloc_punch(
struct inode *inode,
loff_t offset,
- loff_t length)
+ loff_t length,
+ struct iomap *iomap)
{
xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
- return 0;
}
static int
@@ -1227,17 +1227,8 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
-
- struct xfs_mount *mp = XFS_M(inode->i_sb);
- int error;
-
- error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
- length, written, &xfs_buffered_write_delalloc_punch);
- if (error && !xfs_is_shutdown(mp)) {
- xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
- __func__, XFS_I(inode)->i_ino);
- return error;
- }
+ iomap_file_buffered_write_punch_delalloc(inode, offset, length, written,
+ flags, iomap, &xfs_buffered_write_delalloc_punch);
return 0;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 1cdc8034f54d..ee79cf161312 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -567,7 +567,7 @@ xfs_stat_blksize(
return 1U << mp->m_allocsize_log;
}
- return PAGE_SIZE;
+ return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
STATIC int
@@ -870,16 +870,6 @@ xfs_setattr_size(
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
} else {
- /*
- * iomap won't detect a dirty page over an unwritten block (or a
- * cow block over a hole) and subsequently skips zeroing the
- * newly post-EOF portion of the page. Flush the new EOF to
- * convert the block before the pagecache truncate.
- */
- error = filemap_write_and_wait_range(inode->i_mapping, newsize,
- newsize);
- if (error)
- return error;
error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1a74fe22672e..ec766b4bc853 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2128,7 +2128,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 460f93a9ce00..1fdd79c5bfa0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -132,11 +132,15 @@ xfs_sb_validate_fsb_count(
xfs_sb_t *sbp,
uint64_t nblocks)
{
- ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+ uint64_t max_bytes;
+
ASSERT(sbp->sb_blocklog >= BBSHIFT);
+ if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes))
+ return -EFBIG;
+
/* Limited by ULONG_MAX of page cache index */
- if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ if (max_bytes >> PAGE_SHIFT > ULONG_MAX)
return -EFBIG;
return 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26767745d9fd..fbb3a1594c0d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1638,16 +1638,28 @@ xfs_fs_fill_super(
goto out_free_sb;
}
- /*
- * Until this is fixed only page-sized or smaller data blocks work.
- */
if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
- xfs_warn(mp,
- "File system with blocksize %d bytes. "
- "Only pagesize (%ld) or less will currently work.",
+ size_t max_folio_size = mapping_max_folio_size_supported();
+
+ if (!xfs_has_crc(mp)) {
+ xfs_warn(mp,
+"V4 Filesystem with blocksize %d bytes. Only pagesize (%ld) or less is supported.",
mp->m_sb.sb_blocksize, PAGE_SIZE);
- error = -ENOSYS;
- goto out_free_sb;
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ if (mp->m_sb.sb_blocksize > max_folio_size) {
+ xfs_warn(mp,
+"block size (%u bytes) not supported; Only block size (%zu) or less is supported",
+ mp->m_sb.sb_blocksize, max_folio_size);
+ error = -ENOSYS;
+ goto out_free_sb;
+ }
+
+ xfs_warn(mp,
+"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
+ mp->m_sb.sb_blocksize);
}
/* Ensure this filesystem fits in the page cache limits */
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 3b103715acc9..35166c92420c 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -563,7 +563,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
if (ret == -EIO)
zonefs_io_error(inode, true);
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
index 8ccb65c2b419..ff9a688f1f9c 100644
--- a/fs/zonefs/sysfs.c
+++ b/fs/zonefs/sysfs.c
@@ -92,6 +92,7 @@ int zonefs_sysfs_register(struct super_block *sb)
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
int ret;
+ super_set_sysfs_name_id(sb);
init_completion(&sbi->s_kobj_unregister);
ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype,
zonefs_sysfs_root, "%s", sb->s_id);